From 069b05cf3dc947dda34e80b6c294d6ed600451fe Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:21:11 +0800 Subject: [PATCH 001/172] [TRTLLM-9706] [doc] Update wide EP documents (#9724) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 14 +- ...nt-guide-for-kimi-k2-thinking-on-trtllm.md | 15 + .../disaggregated/slurm/benchmark/README.md | 273 +++++++++++------- examples/wide_ep/README.md | 92 +++--- examples/wide_ep/slurm_scripts/README.md | 128 ++++++-- examples/wide_ep/slurm_scripts/config.yaml | 6 - 6 files changed, 361 insertions(+), 167 deletions(-) diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index da72ee5464..bbb276a6e9 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -30,7 +30,7 @@ In this blog, we share the configurations and procedures about how to reproduce - [Expected Result Format](#expected-result-format-3) - [Exploring more ISL/OSL combinations](#exploring-more-islosl-combinations) - [WIP: Enable more features by default](#wip-enable-more-features-by-default) - - [Not supported: MLA chunked context support on Hopper](#not-supported-mla-chunked-context-support-on-hopper) + - [MLA chunked context](#mla-chunked-context) - [Out of memory issues](#out-of-memory-issues) @@ -69,8 +69,11 @@ For NVIDIA Hopper GPUs, it's recommended to use the FP8 version of the DeepSeek YOUR_MODEL_PATH= cd $YOUR_MODEL_PATH -## Download FP4 model for Blackwell GPUs -git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4 +## Download NVFP4 model for Blackwell GPUs +git clone https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2 + +## Or the 0528 version +git clone https://huggingface.co/nvidia/DeepSeek-R1-0528-NVFP4-v2 ## Download 
FP8 model for Hopper GPUs ## FP8 model also works for Blackwell, but FP4 has the best performance on Blackwell. @@ -402,9 +405,10 @@ Average request latency (ms): 181540.5739 ## Exploring more ISL/OSL combinations To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use similar commands mentioned in the previous section. TensorRT LLM is working on enhancements that can make the benchmark process smoother. + ### WIP: Enable more features by default -Currently, there are some features that need to be enabled through a user-defined file `extra-llm-api-config.yml`, such as CUDA graph, overlap scheduler and attention dp. We're working on to enable those features by default, so that users can get good out-of-the-box performance on DeepSeek models. +Currently, there are some features that need to be enabled through a user-defined file `extra-llm-api-config.yml`, such as attention dp. We're working on enabling those features by default, so that users can get good out-of-the-box performance on DeepSeek models. Note that, `max_batch_size` and `max_num_tokens` can easily affect the performance. The default values for them are already carefully designed and should deliver good performance on overall cases, however, you may still need to tune it for peak performance. @@ -414,7 +418,7 @@ For more details on `max_batch_size` and `max_num_tokens`, refer to [Tuning Max ### MLA chunked context -MLA currently supports the chunked context feature on both Hopper and Blackwell GPUs. You can use `--enable_chunked_context` to enable it. This feature is primarily designed to reduce TPOT (Time Per Output Token). The default chunk size is set to `max_num_tokens`. If you want to achieve a lower TPOT, you can appropriately reduce the chunk size. However, please note that this will also decrease overall throughput. Therefore, a trade-off needs to be considered. 
+MLA currently supports the chunked context feature on both Hopper and Blackwell GPUs. You can use `--enable_chunked_context` to enable it. This feature is primarily designed to reduce TPOT (Time Per Output Token). The default chunk size is set to `max_num_tokens`. If you want to achieve a lower TPOT, you can appropriately reduce the chunk size. However, please note that this will also decrease overall throughput. Therefore, a trade-off needs to be considered. For more details on `max_num_tokens`, refer to [Tuning Max Batch Size and Max Num Tokens](../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md). diff --git a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md index d8ec17daff..391a72091d 100644 --- a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md @@ -306,3 +306,18 @@ Run `bench.sh` to begin a serving benchmark. ```shell ./bench.sh ``` + +## Troubleshooting + +Because Kimi K2 Thinking has a larger weight size than other models, host OOM issues may occur, such as the following: + +```log +Loading weights: 100%|█████████████████████| 1408/1408 [03:43<00:00, 6.30it/s] + 0: [12/04/2025-18:38:28] [TRT-LLM] [RANK 0] [I] moe_load_balancer finalizing model... + 1: [nvl72136-T14:452151:0:452151] Caught signal 7 (Bus error: nonexistent physical address) + 1: ==== backtrace (tid: 452151) ==== + 1: 0 /usr/local/ucx//lib/libucs.so.0(ucs_handle_error+0x2cc) [0xffff9638274c] + 1: 1 /usr/local/ucx//lib/libucs.so.0(+0x328fc) [0xffff963828fc] + 1: 2 /usr/local/ucx//lib/libucs.so.0(+0x32c78) [0xffff96382c78] +``` +This can be addressed by mounting `tmpfs:/dev/shm:size=640G` when launching the Docker container, to increase the shm size that the container can access. 
diff --git a/examples/disaggregated/slurm/benchmark/README.md b/examples/disaggregated/slurm/benchmark/README.md index 1a92039d93..4f70f6481c 100644 --- a/examples/disaggregated/slurm/benchmark/README.md +++ b/examples/disaggregated/slurm/benchmark/README.md @@ -6,12 +6,16 @@ This directory contains scripts to run disaggregated inference benchmarks using The benchmarking process is orchestrated through a combination of Python scripts and YAML configuration: -1. `config.yaml`: The main configuration file that defines all benchmark parameters including SLURM settings, hardware configuration, worker settings, and benchmark modes. -2. `disaggr_torch.slurm`: The SLURM script that sets up and runs a single benchmark experiment based on the YAML configuration. -3. Python scripts for configuration and execution: - - Worker configuration generation - - Server configuration generation - - Benchmark execution and metrics collection +1. **`submit.py`**: Main entry point for submitting benchmark jobs. Handles configuration validation, worker config generation, and SLURM job submission. +2. **`config.yaml`**: The main configuration file that defines all benchmark parameters including SLURM settings, hardware configuration, worker settings, and benchmark modes. +3. **`disaggr_torch.slurm`**: The SLURM batch script that sets up the container environment, initializes workers, and runs benchmarks. +4. 
**Supporting scripts**: + - `start_worker.sh`: Initializes context and generation workers + - `start_server.sh`: Starts the disaggregated serving coordinator + - `wait_server.sh`: Waits for server readiness before benchmarking + - `run_benchmark.sh` / `run_benchmark_nv_sa.sh`: Execute benchmark workloads + - `accuracy_eval.sh`: Runs accuracy evaluation using lm_eval + - `gen_server_config.py`: Generates server configuration from worker settings ## Configuration (config.yaml) @@ -25,105 +29,155 @@ slurm: account: "" job_time: "02:00:00" job_name: "" - numa_bind: true + extra_args: "" # Additional SLURM arguments (e.g., "--gres=gpu:4 --exclude=node1") + numa_bind: true # Enable NUMA binding for GB200 NVL72 ``` -### 2. Benchmark Mode +### 2. Benchmark Configuration ```yaml benchmark: - mode: "e2e" # Options: e2e, gen_only - use_nv_sa_benchmark: false - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true + mode: "e2e" # Options: e2e (end-to-end), gen_only (generation only) + use_nv_sa_benchmark: false # Use NVIDIA SA benchmark script + multi_round: 8 # Number of benchmark rounds + benchmark_ratio: 0.8 # Fraction of requests to benchmark + streaming: true # Enable streaming mode + concurrency_list: "16" # Comma-separated list of concurrency levels to test + input_length: 1024 # Input sequence length + output_length: 1024 # Output sequence length + dataset_file: "" # Path to dataset file ``` ### 3. Hardware Configuration ```yaml hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 + gpus_per_node: 4 # GPUs per node in your cluster + num_ctx_servers: 1 # Number of context processing servers + num_gen_servers: 1 # Number of generation servers ``` -### 4. Sequence Configuration -```yaml -sequence: - input_length: 1024 - output_length: 1024 -``` - -### 5. Environment Configuration +### 4. 
Environment Configuration ```yaml environment: container_mount: "" # Format: path1:path1,path2:path2 - container_image: "" - model_path: "" - trtllm_repo: "" - build_wheel: false - dataset_file: "" - work_dir: "" + container_image: "" # Path to TensorRT-LLM container + model_path: "" # Path to model checkpoint + trtllm_repo: "" # Path to TensorRT-LLM repository + build_wheel: false # Set true to build TensorRT-LLM from source + trtllm_wheel_path: "" # Path to pre-built wheel (if not building from source) + work_dir: "" # Working directory for outputs + worker_env_var: "TLLM_LOG_LEVEL=INFO ..." # Environment variables for workers + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" # Environment variables for server ``` -### 6. Worker Configuration +### 5. Worker Configuration The worker configuration section defines detailed settings for both context and generation workers: ```yaml worker_config: - concurrency_list: "16" - eplb_num_slots: 0 - mtp_size: 0 gen: - tensor_parallel_size: 16 - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - enable_attention_dp: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 # For MoE models + enable_attention_dp: true # Enable attention data parallelism # Additional generation worker settings... + ctx: tensor_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 4608 + moe_expert_parallel_size: 4 enable_attention_dp: true # Additional context worker settings... ``` ## Running the Benchmark -The benchmark system now uses a more streamlined approach with configuration defined in YAML and execution handled by Python scripts. +The benchmark system uses a streamlined approach with configuration defined in YAML and execution handled by the `submit.py` Python script. + +### Prerequisites + +Before running benchmarks, ensure you have: + +1. **SLURM cluster access** with valid partition and account +2. **Container environment** with NVIDIA Container Toolkit configured +3. 
**Model checkpoint** files accessible from all cluster nodes +4. **Required device mappings** configured (e.g., `/dev/gdrdrv` for GDRCopy) +5. **Python 3** with PyYAML installed ### Step 1: Configure the Benchmark -Edit the `config.yaml` file to set up your benchmark parameters. The configuration is organized into logical sections: +Create or edit a configuration YAML file based on `config.yaml`. Update the following required fields: -1. SLURM settings (partition, account, time limits) -2. Hardware configuration (GPUs, server counts) -3. Benchmark parameters (mode, sequence lengths, streaming) -4. Environment settings (container, model paths) -5. Worker configurations (parallelism, batch sizes, memory settings) - -### Step 2: Launch the Benchmark - -The benchmark can be launched using the SLURM system: +1. **SLURM settings**: partition, account, job time limits +2. **Hardware configuration**: GPUs per node, server counts +3. **Benchmark parameters**: mode, sequence lengths, concurrency, streaming +4. **Environment settings**: container image and mount paths, model path, work directory +5. **Worker configurations**: parallelism settings, batch sizes, memory configurations +Example: ```bash -sbatch disaggr_torch.slurm +cp config.yaml my_benchmark.yaml +# Edit my_benchmark.yaml with your settings ``` -The SLURM script will: -1. Read and validate the YAML configuration -2. Set up the container environment -3. Configure and start the workers and servers -4. Execute the benchmark -5. Collect and store metrics +### Step 2: Submit the Benchmark Job + +Use the `submit.py` script to submit your benchmark job: + +```bash +# Submit a single configuration +python3 submit.py -c my_benchmark.yaml + +# Or submit multiple configurations from a directory +python3 submit.py -d ./configs/ +``` + +The submission script will: +1. Validate the YAML configuration +2. Calculate required nodes based on parallelism settings +3. Generate worker configuration files +4. 
Submit the SLURM job with appropriate parameters + +The SLURM job (via `disaggr_torch.slurm`) will then: +1. Start the container environment +2. Install or build TensorRT-LLM (if configured) +3. Launch context and generation workers +4. Start the disaggregated serving coordinator +5. Execute the benchmark workload +6. Run accuracy evaluation (if enabled) +7. Collect and store all metrics and logs + +### Monitoring and Results + +After submitting your job, you can monitor its progress: + +```bash +# Check job status +squeue -u $USER + +# View job output (replace with your SLURM job ID) +tail -f slurm-.out + +# Monitor worker logs in the work directory +ls ////logs/ +``` + +Results are automatically organized in the work directory: +``` +/ + └── / + └── -/ + └── ctx_gen_dep_batch_eplb_mtp/ + ├── logs/ + ├── ctx_config.yaml + ├── gen_config.yaml + ├── job_info.txt + └── bench.log +``` ### Benchmark Modes The system supports two primary benchmark modes: -1. **End-to-End (e2e)**: Tests the complete pipeline including both context and generation phases -2. **Generation Only (gen_only)**: Focuses on testing just the generation phase +1. **End-to-End (e2e)**: Tests the complete disaggregated inference pipeline including both context processing and token generation phases +2. 
**Generation Only (gen_only)**: Focuses solely on testing the generation phase with pre-cached KV data Configure the mode in the YAML file: ```yaml @@ -135,48 +189,73 @@ benchmark: The benchmark system collects various performance metrics: -- TTFT (Time to First Token) -- TPOT (Throughput Over Time) -- ITL (Inter-Token Latency) -- E2EL (End-to-End Latency) +- **TTFT** (Time to First Token): Latency from request submission to first token generation +- **TPOT** (Time Per Output Token): Average time to generate each token +- **ITL** (Inter-Token Latency): Latency between consecutive tokens +- **E2EL** (End-to-End Latency): Total request latency from input to completion +- **Throughput**: Requests per second and tokens per second -Metrics are automatically collected and stored in the work directory specified in the configuration. +Metrics are automatically collected from worker iteration logs and stored in the work directory. ### Advanced Features -1. **NVIDIA SA Benchmark Integration** - ```yaml - benchmark: - use_nv_sa_benchmark: true - ``` +#### 1. Accuracy Evaluation -2. **Profiling Support** - ```yaml - profiling: - nsys_on: true - ``` +Enable accuracy evaluation using the lm_eval framework: -3. **Custom Worker Settings** - The worker configuration section allows detailed customization of both context and generation workers, including: - - Tensor and pipeline parallelism - - Batch sizes and token limits - - Memory management - - Cache configuration - - MoE settings (if applicable) +```yaml +accuracy: + enable_accuracy_test: true + model: "local-completions" + tasks: "gsm8k,hellaswag,mmlu" # Comma-separated task list + model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096" +``` -4. 
**Container and Build Options** - ```yaml - environment: - build_wheel: true # Build TensorRT-LLM from source - container_mount: "path1:path1,path2:path2" - ``` +Accuracy results will be saved in `/accuracy_eval/` after benchmark completion. -### Output and Logs +#### 2. NVIDIA Nsight Systems Profiling -Benchmark results and logs are stored in the specified work directory, including: -- Performance metrics -- Worker and server logs -- Profiling data (if enabled) -- Error logs and diagnostics +Enable profiling to analyze performance bottlenecks: -The system automatically organizes outputs by benchmark run and configuration. +```yaml +profiling: + nsys_on: true + ctx_profile_range: "10-30" # Profile iterations 10-30 for context workers + gen_profile_range: "200-250" # Profile iterations 200-250 for generation workers +``` + +Profiling data (`.nsys-rep` files) will be saved in the log directory. + +#### 3. Batch Job Submission + +Submit multiple benchmark configurations at once: + +```bash +# Create a directory with multiple config files +mkdir -p ./configs +cp config.yaml ./configs/config1.yaml +cp config.yaml ./configs/config2.yaml +# Edit each config... + +# Submit all configurations +python3 submit.py -d ./configs/ +``` + +Each configuration will be submitted as a separate SLURM job. + +#### 4. 
Custom TensorRT-LLM Installation + +Build from source: +```yaml +environment: + trtllm_repo: "/path/to/TensorRT-LLM" + build_wheel: true # Builds wheel on one node +``` + +Or install from pre-built wheel: +```yaml +environment: + trtllm_wheel_path: "/path/to/tensorrt_llm-*.whl" + trtllm_repo: "" + build_wheel: false +``` diff --git a/examples/wide_ep/README.md b/examples/wide_ep/README.md index 9b9ea4e8db..a9b52cbe8a 100644 --- a/examples/wide_ep/README.md +++ b/examples/wide_ep/README.md @@ -4,7 +4,7 @@ TensorRT-LLM's Wide Expert Parallelism (Wide-EP) feature enables efficient infer ## Overview -Large-scale MoE models like DeepSeek-V3/R1, LLaMA4, and Qwen3 use fine-grained expert designs that introduce new challenges for inference systems: +Large-scale MoE models like DeepSeek-V3/R1, Kimi K2 Thinking, LLaMA4, and Qwen3 use fine-grained expert designs that introduce new challenges for inference systems: - **High memory demands** for expert weights - **Inherent expert-level workload imbalance** due to sparse execution patterns @@ -33,60 +33,66 @@ For more information on NVIDIA IMEX service for NVLink networks, refer to https: #### Coherent Driver-Based Memory Management (CDMM) -Starting from R580 Driver, [Coherent Driver-Based Memory Management (CDMM)](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-65-06/index.html#hardware-software-support) for GB200 platforms is introduced. With CDMM, the driver manages GPU memory instead of the OS. CDMM avoids OS onlining of the GPU memory and the exposing of the GPU memory as a NUMA node to the OS. In Wide-EP, online EPLB need host threads be able to access the GPU memory to do the weights update. +Starting from R580 Driver, [Coherent Driver-Based Memory Management (CDMM)](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-65-06/index.html#hardware-software-support) for GB200 platforms is introduced. With CDMM, the driver manages GPU memory instead of the OS. 
CDMM avoids OS onlining of the GPU memory and the exposing of the GPU memory as a NUMA node to the OS. In Wide-EP, online EPLB needs host threads to be able to access the GPU memory to do the weights update. -When CDMM mode is off, GPU memory are exposed as NUMA nodes, so no additional prerequisites is required. +When CDMM mode is off, GPU memory is exposed as NUMA nodes, so no additional prerequisites are required. -When CDMM mode is on, GPU memory doesn't exist in NUMA nodes, in that case, if online EPLB is needed, [GDRCopy](https://github.com/NVIDIA/gdrcopy?tab=readme-ov-file#build-and-installation) needs to be installed. +When CDMM mode is on, GPU memory doesn't exist in NUMA nodes. In that case, if online EPLB is needed, [GDRCopy](https://github.com/NVIDIA/gdrcopy?tab=readme-ov-file#build-and-installation) needs to be installed. When GDRCopy is installed and the kernel module is loaded, you should be able to see the device file `/dev/gdrdrv` and kernel module `gdrdrv` by `lsmod`. The device file needs to be mapped into the container. * For docker, this can be done by adding a device mapping like `--device=/dev/gdrdrv:/dev/gdrdrv`. * For slurm with enroot, `--container-mounts="/dev/gdrdrv:/dev/gdrdrv"` needs to be added when starting containers and environment variable `export ENROOT_ALLOW_DEV=yes` needs to be set. -### Configurations +### Online Load Balancer Configurations An example yaml file to enable wide EP: -```yaml -moe_config: - backend: WIDEEP - max_num_tokens: 9216 - load_balancer: moe_load_balancer.yaml # (optional) enable load balancer -``` - -| Parameter | Description | Default | Notes | -|-----------|-------------|---------|-------| -| `backend` | MoE backend type | `CUTLASS` | Set to `WIDEEP` to enable wide EP | -| `max_num_tokens` | If set, at most max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. 
| `None` | If the number of tokens exceeds max_num_tokens, the input tensors will be split into chunks and a for loop will be used. | -| `load_balancer` | Configuration for MoE load balancing | `None` | Set path to the yaml file | - -#### Online Load Balancer Configuration - ```yaml moe_config: backend: WIDEEP max_num_tokens: 9216 load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + num_slots: 288 + layer_updates_per_iter: 1 # (optional) enable online load balancer ``` -| Parameter | Description | Default | Notes | -|-----------|-------------|---------|-------| -| `num_slots` | Total number of expert slots | `None` | Must be ≥ total experts | -| `layer_updates_per_iter` | Number of layers updated per iteration | `0` | `0` = offline, `>0` = online | +#### `backend` -#### Offline Load Balancer Configuration + - MoE backend type, defaults to `CUTLASS`. + - Currently, TensorRT LLM has multiple MoE backends that support wide EP, including `WIDEEP`, `CUTLASS`, `TRTLLM` and `CUTEDSL`. There are on-going efforts to refactor the backends so that we don't necessarily need a specific `WIDEEP` backend, and each other backend will support wide EP functionality. + +#### `max_num_tokens` + +If set, at most `max_num_tokens` tokens will be sent to `torch.ops.trtllm.fused_moe` at the same time. If the number of tokens exceeds `max_num_tokens`, the input tensors will be split into chunks and a for loop will be used. + +#### `load_balancer` + +Configuration for MoE load balancing. For online EPLB, set `num_slots` and `layer_updates_per_iter` directly; for offline EPLB, set the path to a YAML file that also includes `initial_global_assignments`. + +#### `num_slots` + +Total number of expert slots, must be ≥ total experts. Three typical settings: + +1. Set to 0. MoE load balancing is disabled. +2. Set to number of total experts, such as 256 for DeepSeek R1. +3. Set to number of total experts + EP size, such as 288 for DeepSeek R1, 32-way EP. 
+ * This means there is 1 extra expert on each EP rank, so that there is more room for the per-rank token distribution to be more balanced. + +#### `layer_updates_per_iter` + +Number of layers updated per iteration, defaults to `0`. `0` means offline, while `>0` means online EPLB. + +### Offline Load Balancer Configuration Refer to the [Offline EP Load Balancer](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/ep_load_balancer#offline-ep-load-balancer) documentation. -*Online EP Load Balancer is more suitable for production deployment needs to react timely to the online traffic changes.* +*Note: Online EP Load Balancer is more suitable for production deployments that need to react timely to online traffic changes.* ### Execute Wide-EP on SLURM Clusters -Refer to the [slurm_scripts](./slurm_scripts/) directory, which reuses [disaggregated slurm scripts](../disaggregated/slurm/) to automatically generate configuration files and submit jobs to SLURM clusters. +Refer to the [slurm_scripts](./slurm_scripts/) directory, which reuses [disaggregated slurm scripts](../disaggregated/slurm/) for submitting jobs to SLURM clusters. -## Trouble shooting +## Troubleshooting ### Transparent HugePages failure @@ -104,7 +110,7 @@ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled ### GB200 NUMA binding -GPU memory are also on NUMA nodes on GB200 and system can also use that. Bind memory to CPU nodes to avoid GPU memory being used as host memory. +GPU memory is also on NUMA nodes on GB200 and the system can also use that. Bind memory to CPU nodes to avoid GPU memory being used as host memory. ```bash numactl -m 0,1 ``` @@ -138,13 +144,20 @@ rm -f /dev/shm/moe_shared_l0_lr0_all **Warning:** Be careful when removing shared memory manually, as this may affect running processes that depend on these shared memory segments. 
-### Hang issue caused by `UnpicklingError` +### Host OOM -It's possible to see hang issue that is caused by an `UnpicklingError`, we've noticed that and recorded it as a known issue. The issue seems to be existing in MPI, because we are not reproducing again after by-passing the MPI route by implementing customized InfiniBand communicator and replacing MPI API calls with that. We did not proceed because: -1. The implementation only works on InfiniBand, hence not general enough. -2. The implementation largely duplicated with InfiniBand communicator implementation in NCCL, which is hard to maintain. +EPLB requires all experts to be loaded into host memory, so models with larger weights (such as Kimi K2 Thinking) may run into host OOM issues, such as the following: -That being said, we are aware of the `UnpicklingError`, but instead of pushing further, we decided to keep observing for a while to see if it would be gone with further 3rd-party dependency upgrade. Please let us know if it's a blocker in your workload, and we will do necessary adjustment based on the feedback. +```log +Loading weights: 100%|█████████████████████| 1408/1408 [03:43<00:00, 6.30it/s] + 0: [12/04/2025-18:38:28] [TRT-LLM] [RANK 0] [I] moe_load_balancer finalizing model... + 1: [nvl72136-T14:452151:0:452151] Caught signal 7 (Bus error: nonexistent physical address) + 1: ==== backtrace (tid: 452151) ==== + 1: 0 /usr/local/ucx//lib/libucs.so.0(ucs_handle_error+0x2cc) [0xffff9638274c] + 1: 1 /usr/local/ucx//lib/libucs.so.0(+0x328fc) [0xffff963828fc] + 1: 2 /usr/local/ucx//lib/libucs.so.0(+0x32c78) [0xffff96382c78] +``` +This can be addressed by mounting `tmpfs:/dev/shm:size=640G` when launching the Docker container, to increase the shm size that the container can access. 
### Disaggregated serving related issues @@ -152,9 +165,14 @@ Refer to the [Troubleshooting and FAQ](https://github.com/NVIDIA/TensorRT-LLM/bl ## References -- Technical Blog: Scaling Expert Parallelism in TensorRT-LLM +To understand more details on wide EP and the optimizations we've added, refer to the technical blog series: Scaling Expert Parallelism in TensorRT-LLM - [Part 1: Design and Implementation of Large-scale EP](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md) - [Part 2: Performance Status and Optimization](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md) + - [Part 3: Pushing the Performance Boundary](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md) + +To review how wide EP helps with Blackwell's leading inference benchmarks, also read these recent blog posts: +* [NVIDIA Blackwell Leads on SemiAnalysis InferenceMAX™ v1 Benchmarks](https://developer.nvidia.com/blog/nvidia-blackwell-leads-on-new-semianalysis-inferencemax-benchmarks/) +* [NVIDIA Blackwell Raises Bar in New InferenceMAX Benchmarks, Delivering Unmatched Performance and Efficiency](https://blogs.nvidia.com/blog/blackwell-inferencemax-benchmark-results/) For detailed implementation examples and advanced usage, see the subdirectories: - [`ep_load_balancer/`](ep_load_balancer/): Load balancing tools and examples diff --git a/examples/wide_ep/slurm_scripts/README.md b/examples/wide_ep/slurm_scripts/README.md index 35dad32fcf..a3865035fe 100644 --- a/examples/wide_ep/slurm_scripts/README.md +++ b/examples/wide_ep/slurm_scripts/README.md @@ -1,40 +1,124 @@ -# TensorRT LLM Wide-EP Benchmark Scripts +# Wide-EP SLURM Benchmark Scripts -This directory contains scripts for benchmarking TensorRT LLM wide-ep performance using SLURM job scheduler. 
+This directory contains configuration files and utilities for benchmarking TensorRT-LLM Wide Expert Parallelism (Wide-EP) performance on SLURM-managed clusters. -## ⚠️ DISCLAIMER +## Overview -**These scripts are currently not QA'ed and are provided for demonstration purposes only.** +The Wide-EP benchmarking infrastructure leverages the [disaggregated serving benchmark framework](../../disaggregated/slurm/benchmark/) to evaluate MoE model performance with expert parallelism at scale. This directory provides: -Please note that: +- **Configuration templates** for Wide-EP deployments (`config.yaml`) +- **Post-processing utilities** for benchmark analysis (`process_gen_iterlog.py`) -- These scripts have not undergone formal quality assurance testing -- They are intended for demonstration and educational purposes -- Use at your own risk in production environments -- Always review and test scripts thoroughly before running in your specific environment +### Core Implementation -## Scripts Overview +The core SLURM submission and execution logic is implemented in [`examples/disaggregated/slurm/benchmark/`](../../disaggregated/slurm/benchmark/). The scripts in that directory handle: +- Job submission to SLURM clusters +- Multi-node distributed execution +- Worker initialization and coordination +- Benchmark execution and result collection -### Core Scripts +## Files in This Directory -Note that, core implementation of the slurm scripts are included in `examples/disaggregated/slurm/benchmark`. +### `config.yaml` -1. `process_gen_iterlog.py` - Processes benchmark results and generates reports +Example configuration file for Wide-EP benchmarks. 
Key sections include: + +- **SLURM Configuration**: Cluster-specific settings (partition, account, job parameters) +- **Benchmark Mode**: Testing parameters (concurrency, sequence lengths, streaming mode) +- **Hardware Configuration**: GPU topology and server counts +- **Environment**: Container images, model paths, and environment variables +- **Worker Configuration**: Detailed settings for generation and context workers, including: + - Parallelism settings (TP, EP, PP) + - MoE configuration with load balancer settings + - CUDA graph and KV cache configurations + - Speculative decoding parameters + +See the inline comments in [`config.yaml`](config.yaml) for detailed parameter descriptions. + +### `process_gen_iterlog.py` + +Post-processing script that analyzes benchmark iteration logs to generate performance reports. This script: +- Parses generation worker iteration logs +- Computes throughput and latency statistics +- Generates summary reports for benchmark results ## Usage ### Prerequisites -Before running the scripts, ensure you have: -- Access to a SLURM cluster -- Container image with TensorRT LLM installed -- Model files accessible on the cluster -- Required environment variables set +Before running benchmarks, ensure you have: -### Run Benchmarks +1. **SLURM Cluster Access**: Valid account and partition allocation +2. **Container Environment**: + - NVIDIA Container Toolkit configured + - Required device mappings (e.g., `/dev/nvidia-caps-imex-channels` for GB200, `/dev/gdrdrv` for GDRCopy) +3. **Model Files**: Checkpoint files accessible from all cluster nodes +4. **Configuration**: Updated `config.yaml` with your cluster-specific settings + +### Configuration Setup + +1. Copy and customize the example configuration: ```bash -# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory. -# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`. 
-python3 submit.py -c config.yaml +cp config.yaml my_benchmark_config.yaml +``` + +2. Update the following required fields in `my_benchmark_config.yaml`: + - `slurm.partition`: Your SLURM partition name + - `slurm.account`: Your SLURM account + - `environment.container_image`: Path to your TensorRT-LLM container + - `environment.model_path`: Path to your model checkpoint + - `environment.work_dir`: Working directory for benchmark outputs + - `environment.container_mount`: Mount paths for the container + +3. Adjust hardware configuration to match your setup: + - `hardware.gpus_per_node`: GPUs available per node + - `hardware.num_ctx_servers`: Number of context processing servers + - `hardware.num_gen_servers`: Number of generation servers + +### Running Benchmarks + +Submit a benchmark job using the `submit.py` script from the disaggregated benchmark directory: + +```bash +# Navigate to the benchmark submission directory +cd ../../disaggregated/slurm/benchmark/ + +# Submit the job with your configuration +python3 submit.py -c ../../../wide_ep/slurm_scripts/my_benchmark_config.yaml +``` + +The script will: +1. Validate your configuration +2. Submit a SLURM job with the specified parameters +3. Launch distributed workers across the allocated nodes +4. Execute the benchmark workload +5. 
Collect results in the specified working directory + +### Monitoring and Results + +After submission, monitor your job: + +```bash +# Check job status +squeue -u $USER + +# View job output (replace <job_id> with your SLURM job ID) +tail -f slurm-<job_id>.out + +# Check worker logs in the working directory +ls <work_dir>/logs/ +``` + +Benchmark results will be saved in your configured `work_dir`, including: +- Iteration logs from generation and context workers +- Performance metrics and throughput statistics +- System logs and error reports + +### Post-Processing Results + +Process generation iteration logs to extract performance metrics: + +```bash +python3 process_gen_iterlog.py ``` diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml index 12d83248bf..c019c0d29d 100644 --- a/examples/wide_ep/slurm_scripts/config.yaml +++ b/examples/wide_ep/slurm_scripts/config.yaml @@ -72,12 +72,6 @@ worker_config: - 32 - 64 - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - - 128 print_iter_log: true kv_cache_config: enable_block_reuse: false From d2327095689a694a8e141b19a4ababf3a2a722ea Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:40:29 +0800 Subject: [PATCH 002/172] [https://nvbugs/5666804][test] only adding sampler config for limited models (#9512) Signed-off-by: Ruodi Lu Co-authored-by: Ruodi Lu Co-authored-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Co-authored-by: Larry Xu <197874197+LarryXFly@users.noreply.github.com> --- .../defs/perf/sampler_options_config.py | 20 ++++++++++++++----- tests/integration/defs/perf/test_perf.py | 9 +++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/integration/defs/perf/sampler_options_config.py b/tests/integration/defs/perf/sampler_options_config.py index 70a1ac97e8..26bab12cdd 100644 --- a/tests/integration/defs/perf/sampler_options_config.py +++ b/tests/integration/defs/perf/sampler_options_config.py @@ -26,9
+26,19 @@ def get_sampler_options_config(model_label: str) -> dict: Returns: dict: sampler options config """ - base_config = { - 'top_k': 4, - 'top_p': 0.5, - 'temperature': 0.5, - } + base_config = {} + if model_label in [ + 'llama_v3.1_70b_instruct-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:200,2000-reqs:64-con:200-gpus:8', + 'llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:8', + 'llama_v3.2_1b-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:2', + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4', + 'llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-gpus:8', + 'llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-ep:8-gpus:8', + 'llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-ep:8-gpus:8', + 'mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:1000,2000-reqs:500-con:200-gpus:2', + 'phi_4_mini_instruct-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128' + ]: + base_config['top_k'] = 4 + base_config['top_p'] = 0.5 + base_config['temperature'] = 0.5 return base_config diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index c8cd559e4d..e6322c3221 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -2091,10 +2091,11 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if not os.path.exists(sampler_options_path): os.makedirs(os.path.dirname(sampler_options_path), exist_ok=True) sampler_config = get_sampler_options_config(self._config.to_string()) - print_info(f"sampler options config: {sampler_config}") - with 
open(sampler_options_path, 'w') as f: - yaml.dump(sampler_config, f, default_flow_style=False) - benchmark_cmd += [f"--sampler_options={sampler_options_path}"] + if sampler_config: + print_info(f"sampler options config: {sampler_config}") + with open(sampler_options_path, 'w') as f: + yaml.dump(sampler_config, f, default_flow_style=False) + benchmark_cmd += [f"--sampler_options={sampler_options_path}"] return benchmark_cmd def get_commands(self): From 137713a8691a4112833b93be9222c54d94c87cab Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Mon, 8 Dec 2025 12:18:29 +0800 Subject: [PATCH 003/172] [None][infra] Waive failed cases for main on 12/08 (#9773) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2d7bfd20d7..23fde77e7c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -437,3 +437,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughp accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) +unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) +unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5649010) +disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) +disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653) +disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653) +disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5722653) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) From 2f526583fb3ac3e86038785bbfa2bf801c767e53 Mon Sep 17 00:00:00 2001 From: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> Date: Mon, 8 Dec 2025 13:22:16 +0800 Subject: [PATCH 004/172] [None][chore] Move the rocketkv e2e test to post-merge (#9768) Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> --- tests/integration/defs/accuracy/references/longbench_v2.yaml | 2 +- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +- tests/integration/test_lists/test-db/l0_b200.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/defs/accuracy/references/longbench_v2.yaml b/tests/integration/defs/accuracy/references/longbench_v2.yaml index eae407f35a..8f782aa481 100644 --- a/tests/integration/defs/accuracy/references/longbench_v2.yaml +++ b/tests/integration/defs/accuracy/references/longbench_v2.yaml @@ -8,5 +8,5 @@ DeepSeek-R1-0528: spec_dec_algo: MTP accuracy: 52.093 meta-llama/Llama-3.1-8B-Instruct: - - accuracy: 26.48 + 
- accuracy: 26.00 sigma: 25.8 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 09b1613f75..24bc65b5e1 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4674,7 +4674,7 @@ class TestStarcoder2_15B(LlmapiAccuracyTestHarness): @skip_pre_blackwell -class TestLlama3_1_8B_Instruct_LongBenchV2(LlmapiAccuracyTestHarness): +class TestLlama3_1_8B_Instruct_RocketKV(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct/" diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 4356e2601d..fd04f2028e 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -56,7 +56,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_LongBenchV2::test_auto_dtype - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] @@ -157,3 +156,4 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype From a422d70be6dd2717c804744faf38bafdfca923a2 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Mon, 8 Dec 2025 13:28:11 +0800 Subject: [PATCH 005/172] [None][chore] Enable tvm_ffi for cute dsl nvfp4_gemm to reduce host overhead. (#9690) Signed-off-by: Mindy Li <11663212+limin2021@users.noreply.github.com> --- requirements.txt | 2 + .../_torch/custom_ops/cute_dsl_custom_ops.py | 165 +++++++++++++----- .../dense_blockscaled_gemm_persistent.py | 20 +-- .../_torch/thop/parallel/test_fp4_linear.py | 44 +++-- 4 files changed, 149 insertions(+), 82 deletions(-) diff --git a/requirements.txt b/requirements.txt index aaf2884f3d..e123aafcde 100644 --- a/requirements.txt +++ b/requirements.txt @@ -73,3 +73,5 @@ nvidia-cutlass-dsl==4.3.1; python_version >= "3.10" plotly numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing partial_json_parser +apache-tvm-ffi==0.1.4 # used for reduce nvidia-cutlass-dsl host overhead +torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index 703dcc430a..897757cf2c 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -240,7 +240,8 @@ if IS_CUTLASS_DSL_AVAILABLE: def __init__(self, output_dtype: torch.dtype, - to_userbuffers: bool = False): + to_userbuffers: bool = False, + use_tvm_ffi: bool = True): super().__init__() if output_dtype != torch.bfloat16: @@ -249,17 +250,19 @@ if 
IS_CUTLASS_DSL_AVAILABLE: ) self.output_dtype = output_dtype self.to_userbuffers = to_userbuffers + self.use_tvm_ffi = use_tvm_ffi def unique_id(self): - return (self.output_dtype, self.to_userbuffers) + return (self.output_dtype, self.to_userbuffers, self.use_tvm_ffi) def __hash__(self): - return hash((self.output_dtype, self.to_userbuffers)) + return hash( + (self.output_dtype, self.to_userbuffers, self.use_tvm_ffi)) def __eq__(self, other): if not isinstance(other, self.__class__): return False - return self.output_dtype == other.output_dtype and self.to_userbuffers == other.to_userbuffers + return self.output_dtype == other.output_dtype and self.to_userbuffers == other.to_userbuffers and self.use_tvm_ffi == other.use_tvm_ffi def get_valid_tactics( self, @@ -464,51 +467,94 @@ if IS_CUTLASS_DSL_AVAILABLE: f"CuteDSL: weight scale factor size mismatch. " f"Expected {expected_b_sf_size} (sf_n={sf_n} * sf_k={sf_k}), " f"got {b_sf_tensor.numel()} for shape N={n}, K={real_k}") + if alpha_tensor.numel() != 1: + raise ValueError(f"CuteDSL: alpha size mismatch. 
" + f"Expected 1, got {alpha_tensor.numel()}") # Reshape to CuteDSL's expected format (just a view, no copy) a_sf_tensor = a_sf_tensor.reshape(sf_m * sf_k) b_sf_tensor = b_sf_tensor.reshape(sf_n * sf_k) - a_ptr = self.make_cute_dsl_global_pointer(a_tensor, - cutlass.Float4E2M1FN, 32) - b_ptr = self.make_cute_dsl_global_pointer(b_tensor, - cutlass.Float4E2M1FN, 32) - a_sf_ptr = self.make_cute_dsl_global_pointer( - a_sf_tensor, cutlass.Float8E4M3FN, 16) - b_sf_ptr = self.make_cute_dsl_global_pointer( - b_sf_tensor, cutlass.Float8E4M3FN, 16) - c_ptr = self.make_cute_dsl_global_pointer(c_tensor, - cutlass.BFloat16, 16) - # Create pointer to alpha on device - alpha_ptr = self.make_cute_dsl_global_pointer( - alpha_tensor, cutlass.Float32, 4) + if not self.use_tvm_ffi: + a_ptr = self.make_cute_dsl_global_pointer( + a_tensor, cutlass.Float4E2M1FN, 32) + b_ptr = self.make_cute_dsl_global_pointer( + b_tensor, cutlass.Float4E2M1FN, 32) + a_sf_ptr = self.make_cute_dsl_global_pointer( + a_sf_tensor, cutlass.Float8E4M3FN, 16) + b_sf_ptr = self.make_cute_dsl_global_pointer( + b_sf_tensor, cutlass.Float8E4M3FN, 16) + c_ptr = self.make_cute_dsl_global_pointer( + c_tensor, cutlass.BFloat16, 16) + alpha_cute_tensor = cute.runtime.from_dlpack(alpha_tensor) - # get stream - torch_stream = torch.cuda.current_stream() - stream = cuda.CUstream(torch_stream.cuda_stream) + # get stream + torch_stream = torch.cuda.current_stream() + stream = cuda.CUstream(torch_stream.cuda_stream) cache_key = (sf_vec_size, mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch) if swap_ab: - kernel_a_ptr = b_ptr - kernel_a_sf_ptr = b_sf_ptr - kernel_b_ptr = a_ptr - kernel_b_sf_ptr = a_sf_ptr kernel_m = n kernel_n = m kernel_sf_m = sf_n kernel_sf_n = sf_m + + kernel_a_tensor = b_tensor + kernel_a_sf_tensor = b_sf_tensor + kernel_b_tensor = a_tensor + kernel_b_sf_tensor = a_sf_tensor + + if not self.use_tvm_ffi: + kernel_a_ptr = b_ptr + kernel_a_sf_ptr = b_sf_ptr + kernel_b_ptr = a_ptr + kernel_b_sf_ptr = 
a_sf_ptr else: - kernel_a_ptr = a_ptr - kernel_a_sf_ptr = a_sf_ptr - kernel_b_ptr = b_ptr - kernel_b_sf_ptr = b_sf_ptr kernel_m = m kernel_n = n kernel_sf_m = sf_m kernel_sf_n = sf_n + kernel_a_tensor = a_tensor + kernel_a_sf_tensor = a_sf_tensor + kernel_b_tensor = b_tensor + kernel_b_sf_tensor = b_sf_tensor + + if not self.use_tvm_ffi: + kernel_a_ptr = a_ptr + kernel_a_sf_ptr = a_sf_ptr + kernel_b_ptr = b_ptr + kernel_b_sf_ptr = b_sf_ptr + if cache_key not in self.__class__.kernel_cache: + if self.use_tvm_ffi: + a_ptr = self.make_cute_dsl_global_pointer( + a_tensor, cutlass.Float4E2M1FN, 32) + b_ptr = self.make_cute_dsl_global_pointer( + b_tensor, cutlass.Float4E2M1FN, 32) + a_sf_ptr = self.make_cute_dsl_global_pointer( + a_sf_tensor, cutlass.Float8E4M3FN, 16) + b_sf_ptr = self.make_cute_dsl_global_pointer( + b_sf_tensor, cutlass.Float8E4M3FN, 16) + c_ptr = self.make_cute_dsl_global_pointer( + c_tensor, cutlass.BFloat16, 16) + alpha_cute_tensor = cute.runtime.from_dlpack(alpha_tensor) + # make faked stream + stream = cute.runtime.make_fake_stream( + use_tvm_ffi_env_stream=True) + + if swap_ab: + kernel_a_ptr = b_ptr + kernel_a_sf_ptr = b_sf_ptr + kernel_b_ptr = a_ptr + kernel_b_sf_ptr = a_sf_ptr + else: + kernel_a_ptr = a_ptr + kernel_a_sf_ptr = a_sf_ptr + kernel_b_ptr = b_ptr + kernel_b_sf_ptr = b_sf_ptr + gemm = self.__class__.kernel_class( sf_vec_size, mma_tiler_mn, @@ -520,6 +566,8 @@ if IS_CUTLASS_DSL_AVAILABLE: max_active_clusters = hardware_info.get_max_active_clusters( cluster_shape_mn[0] * cluster_shape_mn[1]) + # Note: when tvm_ffi fake stream is used, at least one parameter should be tensor type, + # so we make alpha as the cute.Tensor type in the jit func.
compiled_gemm = cute.compile( gemm.wrapper, kernel_m, @@ -528,17 +576,18 @@ if IS_CUTLASS_DSL_AVAILABLE: kernel_sf_m // 128, kernel_sf_n // 128, sf_k // 4, - 1, + 1, # batch kernel_a_ptr, kernel_b_ptr, kernel_a_sf_ptr, kernel_b_sf_ptr, c_ptr, - alpha_ptr, # Pass alpha as device pointer + alpha_cute_tensor, max_active_clusters, stream, swap_ab, - options=f"--opt-level 2", + options=f"--opt-level 2 --enable-tvm-ffi" + if self.use_tvm_ffi else "--opt-level 2", ) self.__class__.kernel_cache[cache_key] = compiled_gemm @@ -546,21 +595,39 @@ if IS_CUTLASS_DSL_AVAILABLE: compiled_gemm = self.__class__.kernel_cache[cache_key] # launch gemm kernel - compiled_gemm( - kernel_m, - kernel_n, - real_k, - kernel_sf_m // 128, - kernel_sf_n // 128, - sf_k // 4, - kernel_a_ptr, - kernel_b_ptr, - kernel_a_sf_ptr, - kernel_b_sf_ptr, - c_ptr, - alpha_ptr, # Pass alpha as device pointer - stream, - ) + if self.use_tvm_ffi: + # call with torch pointer types and no need to pass stream. + compiled_gemm( + kernel_m, + kernel_n, + real_k, + kernel_sf_m // 128, + kernel_sf_n // 128, + sf_k // 4, + kernel_a_tensor.data_ptr(), + kernel_b_tensor.data_ptr(), + kernel_a_sf_tensor.data_ptr(), + kernel_b_sf_tensor.data_ptr(), + c_tensor.data_ptr(), + alpha_tensor, + ) + else: + # call with cute types and need to pass torch stream. + compiled_gemm( + kernel_m, + kernel_n, + real_k, + kernel_sf_m // 128, + kernel_sf_n // 128, + sf_k // 4, + kernel_a_ptr, + kernel_b_ptr, + kernel_a_sf_ptr, + kernel_b_sf_ptr, + c_ptr, + alpha_cute_tensor, + stream, + ) if swap_ab: c_tensor = c_tensor.permute(1, 0) @@ -578,6 +645,7 @@ if IS_CUTLASS_DSL_AVAILABLE: alpha: torch.Tensor, output_dtype: torch.dtype, to_userbuffers: bool = False, + use_tvm_ffi: bool = True, ) -> torch.Tensor: """CuteDSL-based NVFP4 GEMM optimized for Blackwell. 
@@ -589,6 +657,7 @@ if IS_CUTLASS_DSL_AVAILABLE: alpha: Scaling factor output_dtype: Output data type (must be bfloat16) to_userbuffers: Whether to allocate output from UserBuffers pool + use_tvm_ffi: Whether to use TVM-FFI to call the kernel. Enabling this option can help reduce the kernel host launch overhead. Note: This function is primarily used internally by nvfp4_gemm. @@ -604,7 +673,8 @@ if IS_CUTLASS_DSL_AVAILABLE: tuner = AutoTuner.get() - runner = CuteDSLNVFP4BlackwellLinear(output_dtype, to_userbuffers) + runner = CuteDSLNVFP4BlackwellLinear(output_dtype, to_userbuffers, + use_tvm_ffi) inputs = [input, weight, input_scale, weight_scale, alpha] _, best_tactic = tuner.choose_one( "trtllm::cute_dsl_nvfp4_gemm_blackwell", @@ -625,6 +695,7 @@ if IS_CUTLASS_DSL_AVAILABLE: alpha: torch.Tensor, # Match custom op signature output_dtype: torch.dtype, to_userbuffers: bool = False, + use_tvm_ffi: bool = True, ): # [m, k] shape = list(mat_a.shape) diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py index 6b6b427edc..44edab9b3f 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py @@ -2017,20 +2017,19 @@ class Sm100BlockScaledPersistentDenseGemmKernel: @cute.jit def wrapper( self, - m, - n, - k, - sf_m, - sf_n, - sf_k, + m: cutlass.Int32, + n: cutlass.Int32, + k: cutlass.Int32, + sf_m: cutlass.Int32, + sf_n: cutlass.Int32, + sf_k: cutlass.Int32, l: cutlass.Constexpr, a_ptr: cute.Pointer, b_ptr: cute.Pointer, a_sf_ptr: cute.Pointer, b_sf_ptr: cute.Pointer, c_ptr: cute.Pointer, alpha: cute.
- Pointer, # Device pointer to alpha, will be converted to Tensor + alpha_tensor: cute.Tensor, max_active_clusters: cutlass.Constexpr, current_stream: cuda.CUstream, swap_ab: cutlass.Constexpr = False, @@ -2051,7 +2050,7 @@ class Sm100BlockScaledPersistentDenseGemmKernel: a_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for A. b_sf_ptr (cute.Pointer): Pointer to the scale factor tensor for B. c_ptr (cute.Pointer): Pointer to the C tensor. - alpha (cute.Pointer): Device pointer to alpha scaling factor (converted to Tensor internally). + alpha_tensor (cute.Tensor): Device tensor to alpha scaling factor. max_active_clusters (cutlass.Constexpr): Maximum number of active clusters. current_stream (cuda.CUstream): CUDA stream for the operation. @@ -2096,9 +2095,6 @@ class Sm100BlockScaledPersistentDenseGemmKernel: (32, 4, sf_n, 4, sf_k, l), order=(2, 1, 4, 0, 3, 5), )) - alpha_tensor = cute.make_tensor(alpha, - layout=cute.make_ordered_layout( - (1, ), order=(0, ))) self(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, alpha_tensor, max_active_clusters, current_stream, epilogue_op) diff --git a/tests/unittest/_torch/thop/parallel/test_fp4_linear.py b/tests/unittest/_torch/thop/parallel/test_fp4_linear.py index cc61e07515..85eff74cee 100644 --- a/tests/unittest/_torch/thop/parallel/test_fp4_linear.py +++ b/tests/unittest/_torch/thop/parallel/test_fp4_linear.py @@ -313,15 +313,17 @@ def nvfp4_gemm_perf_test( x_sf_block_list = [x_sf_block] w_sf_block_list = [w_sf_block] + alpha_tensor = torch.tensor([1.0]).cuda() with torch.inference_mode(), autotune(): with nvtx.annotate( f"cute_dsl tune, m={SEQ_LEN}, k={HIDDEN_SIZE}, n={OUTPUT_SIZE}", color="orange", ): output = torch.ops.trtllm.cute_dsl_nvfp4_gemm_blackwell( - x_fp4, w_fp4, x_sf_block, w_sf_block, 1.0, dtype) + x_fp4, w_fp4, x_sf_block, w_sf_block, alpha_tensor, dtype) + from tensorrt_llm._torch.autotuner import AutoTuner + AutoTuner.get().print_statistics() - alpha_tensor = torch.tensor(1.0).cuda() if 
test_ref: with nvtx.annotate( f"ref tune, m={SEQ_LEN}, k={HIDDEN_SIZE}, n={OUTPUT_SIZE}", @@ -342,7 +344,7 @@ def nvfp4_gemm_perf_test( w_fp4_list[buffer_idx % workspace_count], x_sf_block_list[buffer_idx % workspace_count], w_sf_block_list[buffer_idx % workspace_count], - 1.0, + alpha_tensor, dtype, ) buffer_idx = buffer_idx + 1 @@ -356,7 +358,7 @@ def nvfp4_gemm_perf_test( w_fp4_list[buffer_idx % workspace_count], x_sf_block_list[buffer_idx % workspace_count], w_sf_block_list[buffer_idx % workspace_count], - 1.0, + alpha_tensor, dtype, ) buffer_idx = buffer_idx + 1 @@ -457,7 +459,7 @@ def test_nvfp4_gemm_unified_all_tactics(dtype, mnk): x_fp4, x_sf_block = torch.ops.trtllm.fp4_quantize( x, x_sf_global, scaling_vector_size, False) alpha_ref = 1.0 / (w_sf_global * x_sf_global) - alpha_tensor = torch.tensor(alpha_ref, dtype=torch.float32).cuda() + alpha_tensor = torch.tensor([alpha_ref], dtype=torch.float32).cuda() # Reference: Use CUTLASS backend explicitly for reference output with torch.inference_mode(): @@ -749,23 +751,19 @@ def test_fp4_linear_cuda_core(dtype, mnk): if __name__ == "__main__": # m, n, k - fp4_linear_perf_test(torch.bfloat16, 128, 7168, 16384) - fp4_linear_perf_test(torch.bfloat16, 128, 24576, 1536) - fp4_linear_perf_test(torch.bfloat16, 128, 2112, 7168) - fp4_linear_perf_test(torch.bfloat16, 128, 4096, 7168) - fp4_linear_perf_test(torch.bfloat16, 128, 7168, 2048) + nvfp4_gemm_perf_test(torch.bfloat16, 128, 7168, 16384) - # group-1 test cases - for tokens in [128, 8192]: - nvfp4_gemm_perf_test(torch.bfloat16, tokens, 7168, 16384) - nvfp4_gemm_perf_test(torch.bfloat16, tokens, 24576, 1536) - nvfp4_gemm_perf_test(torch.bfloat16, tokens, 2112, 7168) - nvfp4_gemm_perf_test(torch.bfloat16, tokens, 4096, 7168) - nvfp4_gemm_perf_test(torch.bfloat16, tokens, 7168, 2048) + # # group-1 test cases + # for tokens in [128, 8192]: + # nvfp4_gemm_perf_test(torch.bfloat16, tokens, 7168, 16384) + # nvfp4_gemm_perf_test(torch.bfloat16, tokens, 24576, 1536) + # 
nvfp4_gemm_perf_test(torch.bfloat16, tokens, 2112, 7168) + # nvfp4_gemm_perf_test(torch.bfloat16, tokens, 4096, 7168) + # nvfp4_gemm_perf_test(torch.bfloat16, tokens, 7168, 2048) - # group-2 test cases - for m in [128, 256, 512]: - nvfp4_gemm_perf_test(torch.bfloat16, m, 131584, 7168) - nvfp4_gemm_perf_test(torch.bfloat16, m, 7168, 65792) - nvfp4_gemm_perf_test(torch.bfloat16, m, 227368, 2560, test_ref=False) - nvfp4_gemm_perf_test(torch.bfloat16, m, 2560, 113664) + # # group-2 test cases + # for m in [128, 256, 512]: + # nvfp4_gemm_perf_test(torch.bfloat16, m, 131584, 7168) + # nvfp4_gemm_perf_test(torch.bfloat16, m, 7168, 65792) + # nvfp4_gemm_perf_test(torch.bfloat16, m, 227368, 2560, test_ref=False) + # nvfp4_gemm_perf_test(torch.bfloat16, m, 2560, 113664) From 448bb1a44fc88dacb42cb06d9baa7bc29d4a60cc Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Mon, 8 Dec 2025 13:39:12 +0800 Subject: [PATCH 006/172] =?UTF-8?q?[TRTLLM-9431][perf]=20Enable=20multistr?= =?UTF-8?q?eam=20for=20Linear=20Attention=20in=20Qwen3-=E2=80=A6=20(#9696)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- .../_torch/models/modeling_qwen3_next.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py index e8b2021fb6..8061be539e 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_next.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py @@ -647,11 +647,10 @@ def fused_gdn_gating( class Qwen3NextGatedDeltaNet(nn.Module): - def __init__( - self, - model_config: ModelConfig[Qwen3NextConfig], - layer_idx: Optional[int] = None, - ): + def __init__(self, + model_config: ModelConfig[Qwen3NextConfig], + aux_stream: torch.cuda.Stream, + layer_idx: Optional[int] = None): super().__init__() 
config = model_config.pretrained_config self.model_config = model_config @@ -778,6 +777,12 @@ class Qwen3NextGatedDeltaNet(nn.Module): force_dynamic_quantization=model_config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=False) + self.event_dict = { + key: torch.cuda.Event() + for key in [EventType.Main, EventType.Attention] + } + self.aux_stream = aux_stream + def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba): """ Derives `query`, `key` and `value` tensors from `mixed_qkvzba`. @@ -1032,8 +1037,19 @@ class Qwen3NextGatedDeltaNet(nn.Module): ssm_states[state_indices_p] = 0 # conv_states[state_indices_p] = 0 # not necessary - projected_states_qkvz = self.in_proj_qkvz(hidden_states) - projected_states_ba = self.in_proj_ba(hidden_states) + def _compute_projected_states_qkvz(): + return self.in_proj_qkvz(hidden_states) + + def _compute_projected_states_ba(): + return self.in_proj_ba(hidden_states) + + projected_states_qkvz, projected_states_ba = maybe_execute_in_parallel( + _compute_projected_states_qkvz, + _compute_projected_states_ba, + self.event_dict[EventType.Main], + self.event_dict[EventType.Attention], + self.aux_stream, + ) # Use fused kernel when possible to avoid elementwise ops if self.num_v_heads // self.num_k_heads in [1, 2, @@ -1098,7 +1114,8 @@ class Qwen3NextLinearDecoderLayer(nn.Module): super().__init__() self.model_config = model_config config = model_config.pretrained_config - self.linear_attn = Qwen3NextGatedDeltaNet(model_config, layer_idx) + self.linear_attn = Qwen3NextGatedDeltaNet(model_config, aux_stream, + layer_idx) self.mapping = model_config.mapping self.enable_attention_dp = self.mapping.enable_attention_dp From 3f55c072234921aeeb6260310b76cbb62ff11930 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:51:55 +0800 Subject: [PATCH 007/172] [None][chore] Remove closed bugs (#9770) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- 
tests/integration/test_lists/waives.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 23fde77e7c..c90ff83e2a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -301,7 +301,6 @@ full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search S full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337) accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847) -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] SKIP (https://nvbugs/5568836) unittest/llmapi/test_llm_pytorch.py::test_llm_capture_request_error SKIP (https://nvbugs/5599176) examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143) unittest/llmapi/test_memory_profiling.py SKIP (https://nvbugs/5580781) @@ -312,11 +311,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SK examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313) accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5569696) accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5569696) -test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5596377) triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) 
triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5582258) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5587393) accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233) examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False] SKIP (https://nvbugs/5629791) From e7395c660781a06754c863405938b01f48a6d22c Mon Sep 17 00:00:00 2001 From: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 16:56:40 +0800 Subject: [PATCH 008/172] [None][infra] update mooncake in docker images (#9584) Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com> Signed-off-by: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com> --- docker/common/install_mooncake.sh | 5 +++-- jenkins/current_image_tags.properties | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docker/common/install_mooncake.sh b/docker/common/install_mooncake.sh index 15301ba0fc..badd5f0eb6 100644 --- a/docker/common/install_mooncake.sh +++ b/docker/common/install_mooncake.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -MOONCAKE_VERSION="v0.3.6.post1" +MOONCAKE_VERSION="v0.3.7.post2" MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git" MOONCAKE_INSTALL_PATH="/usr/local/Mooncake" @@ -42,7 +42,8 @@ tar -czf /third-party-source/Mooncake-${MOONCAKE_VERSION}.tar.gz Mooncake cd Mooncake git submodule update --init --recursive --depth 1 mkdir build && 
cd build -cmake .. -DUSE_CUDA=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${MOONCAKE_INSTALL_PATH} +cmake .. -DUSE_CUDA=ON -DBUILD_SHARED_LIBS=ON -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF \ + -DCMAKE_INSTALL_PREFIX=${MOONCAKE_INSTALL_PATH} make -j make install cd ../.. diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 0061d0be7e..a7ae94d2d3 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512041415-9225 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512041415-9225 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512041415-9225 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512041415-9225 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512081220-9584 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512081220-9584 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512081220-9584 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512081220-9584 From 
ededeecb0f71489a80f29cfc7bf7489752a363ab Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Mon, 8 Dec 2025 17:25:07 +0800 Subject: [PATCH 009/172] [None][test] Add Kimi k2 WIDEEP perf and accuracy cases (#9686) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../defs/perf/disagg/compare_backends.py | 5 + .../integration/defs/perf/disagg/envs/ENV.md | 10 +- .../perf/disagg/execution/subprocess_utils.py | 2 - ...1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml | 2 +- ...x1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml | 2 +- ...x1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 2 +- ...tx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 2 +- ...x1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml | 2 +- ...tx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml | 2 +- ..._gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml | 2 +- ...2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml | 2 +- ...x1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml | 2 +- ...tx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml | 2 +- ...x1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 2 +- ...tx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 2 +- ...6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml | 2 +- ...x6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml | 2 +- ...8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml | 2 +- ...x8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml | 2 +- ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 4 +- ...en1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml | 118 ++++++++++++++++++ ...gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml | 2 +- ..._gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml | 2 +- ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 2 +- ..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 2 +- ...en1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml | 2 +- ...gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml | 2 +- ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 4 +- 
..._gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml | 4 +- ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 4 +- ...gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml | 4 +- ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 4 +- ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 4 +- ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 4 +- ..._gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml | 4 +- ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 4 +- ..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 4 +- ...en1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml | 112 +++++++++++++++++ ...gen1_dep32_bs256_eplb416_mtp0_ccb-UCX.yaml | 112 +++++++++++++++++ .../perf/disagg/testlist/disagg_gb300.txt | 2 + .../defs/perf/disagg/testlist/wideep.txt | 3 + .../defs/perf/disagg/utils/common.py | 19 ++- .../defs/perf/disagg/utils/config_loader.py | 28 +++-- .../perf/disagg/utils/config_validator.py | 2 +- 44 files changed, 438 insertions(+), 63 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-UCX.yaml create mode 100644 tests/integration/defs/perf/disagg/testlist/disagg_gb300.txt diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py index c1a9ed541b..d6fb84e4cb 100644 --- a/tests/integration/defs/perf/disagg/compare_backends.py +++ b/tests/integration/defs/perf/disagg/compare_backends.py @@ -2,6 +2,7 @@ """Compare performance test results between different backends (UCX vs NIXL).""" import argparse +import os import re import sys @@ -44,6 +45,10 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): Returns: DataFrame: 
Comparison results """ + if not os.path.exists(csv_path): + print(f"CSV file not found: {csv_path}") + sys.exit(0) + # Read CSV file df = pd.read_csv(csv_path) diff --git a/tests/integration/defs/perf/disagg/envs/ENV.md b/tests/integration/defs/perf/disagg/envs/ENV.md index 997fc15165..5d1f7320c9 100644 --- a/tests/integration/defs/perf/disagg/envs/ENV.md +++ b/tests/integration/defs/perf/disagg/envs/ENV.md @@ -15,7 +15,8 @@ export TRTLLM_WHEEL_PATH="" export GPU_TYPE="" export SLURM_PARTITION="" export SLURM_ACCOUNT="" -export MODEL_DIR="" +export MODEL_DIR="" +export DATASET_DIR="" export OUTPUT_PATH="" export PATH="" export XDG_CACHE_HOME="" @@ -70,10 +71,15 @@ SLURM account name for job billing and resource allocation. - **Example**: `your_project_account` ### `MODEL_DIR` -Base directory containing models and datasets. This path will be used to locate model checkpoints and dataset files. +Base directory containing models. This path will be used to locate model checkpoints. - **Format**: Absolute path - **Example**: `/shared/models/common` +### `DATASET_DIR` +Base directory containing dataset files. This path will be used to locate dataset files. +- **Format**: Absolute path +- **Example**: `/shared/datasets/common` + ### `OUTPUT_PATH` Directory where test results, HTML reports, and CSV files will be saved. 
- **Format**: Absolute path diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py index 7034254ee0..9ab7771426 100644 --- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py +++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py @@ -56,10 +56,8 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) check=True, **kwargs, ) - # Log stderr if it exists if result.stderr: stderr_output = result.stderr.decode() logger.error(f"Command stderr: {stderr_output}") - return result.stdout.decode() diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml index 33ee191ffd..90a198897b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml index 12ac8edad0..120fc40b3c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: 
model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index ab5bd6f719..6a4f5f5ddf 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 7d8cb97621..e8f1b31a41 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml index 3f9a7d6a2d..2f9d1ad7c8 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml index f2fd2bc21d..e60204a562 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml index 5d9d739d58..a307a87f17 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git 
a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml index f97137297b..d44c4d51e0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml index 6b9078ac5a..05c6794dd6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml index 468354c073..10aa98c4b3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index a970ee6de4..64dd806fa6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 22dc90a06b..b0b7313226 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml index a54b0dacd5..796fdbd874 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml index ab081e78cf..4a45880f14 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index f4a5d3bc3a..bc46d9fea3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 
precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index 9388365383..c397316b35 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,7 +1,7 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 1eaf479dcc..5de651526e 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 0 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json accuracy: datasets: - dataset_name: gsm8k diff --git 
a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..4cbcd13dd5 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml @@ -0,0 +1,118 @@ +metadata: + model_name: kimi-k2-thinking-fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 6 + dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json + accuracy: + datasets: + - dataset_name: gsm8k + expected_value: 0.9454 + threshold_type: hypothesis_test + filter_type: flexible-extract +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 00:45:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 1.0 + streaming: true + concurrency_list: '16384' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: true + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + 
max_seq_len: 5120 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + load_balancer: + num_slots: 384 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + trust_remote_code: true + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 8 + max_num_tokens: 8448 + max_seq_len: 5120 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml index 60a221d996..927fdae988 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 8 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml index 8724f191f5..8c138fc7f0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 11 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 738c720650..a4af6a8596 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 10 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index af30a466be..cf7aaf0f6c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 13 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml index c44b3f6bba..a56926befd 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 9 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml index b7a79d7434..54854c0bf5 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml @@ -8,7 +8,7 @@ metadata: script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 12 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index 73a27246c0..99121fca3d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 1 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml index e95e71ca15..6dcc5d71d3 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 3 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 6055421a27..d934ef4c0a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 0 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml index 6b47c0fc36..0a37ad83db 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 2 - dataset_file: datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml index 1e71708f57..9c045491cc 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -2,14 +2,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 7 - dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml index 06900691bc..fc4e31ed35 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 14 - dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml index 13572a6049..83e3521db0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 5 - dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: 
disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml index 30e6152302..baaa80158b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 7 - dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 55391a698c..7e722b4424 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 4 
- dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index 62301215e9..2205179880 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -1,14 +1,14 @@ metadata: model_name: deepseek-r1-fp4 precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-V2 + model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 6 - dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..78081a23ac --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: kimi-k2-thinking-fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 6 + dataset_file: 
disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 00:45:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 1.0 + streaming: true + concurrency_list: '16384' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + load_balancer: + num_slots: 384 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + trust_remote_code: true + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 8 + max_num_tokens: 8448 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..ce6a85757b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-UCX.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: kimi-k2-thinking-fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 6 + dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 00:45:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 1.0 + streaming: true + concurrency_list: '8192' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + 
max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + load_balancer: + num_slots: 416 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + trust_remote_code: true + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + trust_remote_code: true diff --git a/tests/integration/defs/perf/disagg/testlist/disagg_gb300.txt b/tests/integration/defs/perf/disagg/testlist/disagg_gb300.txt new file mode 100644 index 0000000000..4e0bf609f2 --- /dev/null +++ b/tests/integration/defs/perf/disagg/testlist/disagg_gb300.txt @@ -0,0 +1,2 @@ +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX] +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL] diff --git a/tests/integration/defs/perf/disagg/testlist/wideep.txt b/tests/integration/defs/perf/disagg/testlist/wideep.txt index 55e7bd4721..28684e096f 100644 --- a/tests/integration/defs/perf/disagg/testlist/wideep.txt +++ b/tests/integration/defs/perf/disagg/testlist/wideep.txt @@ -8,6 +8,8 @@ 
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-UCX] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL] @@ -15,3 +17,4 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX] test_disagg.py::TestDisaggBenchmark::test_accuracy[wideep_accuracy_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_accuracy[wideep_accuracy_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-UCX] diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index 9fb72fbacb..c050fdd468 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py 
+++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -82,7 +82,11 @@ class EnvManager: @staticmethod def get_model_dir() -> str: - return os.getenv("MODEL_DIR", "") + return os.getenv("MODEL_DIR", "") + + @staticmethod + def get_dataset_dir() -> str: + return os.getenv("DATASET_DIR", "") @staticmethod def get_output_path() -> str: @@ -99,10 +103,11 @@ class EnvManager: return os.getenv("INSTALL_MODE", "none") @staticmethod - def get_container_mount() -> str: + def get_container_mount(model_name: str = "") -> str: work_dir = EnvManager.get_work_dir() script_dir = EnvManager.get_script_dir() model_dir = EnvManager.get_model_dir() + dataset_dir = EnvManager.get_dataset_dir() output_path = EnvManager.get_output_path() repo_dir = EnvManager.get_repo_dir() trtllm_wheel_path = EnvManager.get_trtllm_wheel_path() @@ -114,10 +119,16 @@ class EnvManager: f"{output_path}:{output_path}", ] + # Kimi-K2 needs 640G of shared memory, otherwise will cause host memory OOM. + if model_name.find("kimi-k2") != -1: + mounts.append("tmpfs:/dev/shm:size=640G") + + if dataset_dir and not dataset_dir.startswith("<"): + mounts.append(f"{dataset_dir}:{dataset_dir}") # Add repo_dir if available - if repo_dir: + if repo_dir and not repo_dir.startswith("<"): mounts.append(f"{repo_dir}:{repo_dir}") - if trtllm_wheel_path: + if trtllm_wheel_path and not trtllm_wheel_path.startswith("<"): trtllm_wheel_dir = os.path.dirname(trtllm_wheel_path) mounts.append(f"{trtllm_wheel_dir}:{trtllm_wheel_dir}") return ",".join(mounts) diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py index f7eeafd0cd..7ee64d410d 100644 --- a/tests/integration/defs/perf/disagg/utils/config_loader.py +++ b/tests/integration/defs/perf/disagg/utils/config_loader.py @@ -88,9 +88,9 @@ DEFAULT_METRICS_CONFIG = { log_file="bench.log", extractor_pattern=r""" ^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? 
^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? ^.*?Benchmark\ with\ concurrency\ (\d+)\ done """, metric_names=["SERVER_MEDIAN_TTFT", "SERVER_MEDIAN_E2EL"], @@ -99,21 +99,29 @@ DEFAULT_METRICS_CONFIG = { log_file="bench.log", extractor_pattern=r""" ^.*?Mean\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?P99\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? ^.*?Mean\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?Median\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?P99\ TPOT\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? ^.*?Mean\ ITL\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?Median\ ITL\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?P99\ ITL\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? ^.*?Mean\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? ^.*?P99\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n - ^.*?(?:\n|.)*?$\n + (?:.*\n)*? ^.*?Benchmark\ with\ concurrency\ (\d+)\ done """, metric_names=[ @@ -308,7 +316,7 @@ class ConfigLoader: supported_gpus = metadata.get("supported_gpus", ["GB200", "GB300", "H100", "B200", "B300"]) # Override config with environment variables (in memory only, do not write back) - config_data = self._apply_env_overrides(config_data) + config_data = self._apply_env_overrides(config_data, model_name) # Generate benchmark_type from sequence configuration benchmark_type = self._generate_benchmark_type(config_data) @@ -440,7 +448,7 @@ class ConfigLoader: logger.debug(f"Using default metrics config for {test_category}") return default_config - def _apply_env_overrides(self, config_data: dict) -> dict: + def _apply_env_overrides(self, config_data: dict, model_name: str) -> dict: """Apply environment variable overrides to configuration. Intelligently replaces empty or None values based on field path. 
@@ -461,7 +469,7 @@ class ConfigLoader: ("slurm", "partition"): lambda: EnvManager.get_slurm_partition(), ("slurm", "account"): lambda: EnvManager.get_slurm_account(), ("slurm", "job_name"): lambda: EnvManager.get_slurm_job_name(), - ("environment", "container_mount"): lambda: EnvManager.get_container_mount(), + ("environment", "container_mount"): lambda: EnvManager.get_container_mount(model_name), ("environment", "container_image"): lambda: EnvManager.get_container_image(), ("environment", "trtllm_repo"): lambda: EnvManager.get_repo_dir(), ("environment", "trtllm_wheel_path"): lambda: EnvManager.get_trtllm_wheel_path(), @@ -500,7 +508,7 @@ class ConfigLoader: """ metadata = config.get("metadata", {}) dataset_file = metadata.get("dataset_file", "") - return os.path.join(EnvManager.get_model_dir(), dataset_file) + return os.path.join(EnvManager.get_dataset_dir(), dataset_file) def _get_script_file(self, config: dict) -> str: """Get script file by combining scripts directory with script file name. 
diff --git a/tests/integration/defs/perf/disagg/utils/config_validator.py b/tests/integration/defs/perf/disagg/utils/config_validator.py index 508e1b53ac..39b65a4e1b 100644 --- a/tests/integration/defs/perf/disagg/utils/config_validator.py +++ b/tests/integration/defs/perf/disagg/utils/config_validator.py @@ -83,5 +83,5 @@ class ConfigValidator: osl = extracted_config["osl"] ctx_max_seq_len = extracted_config["ctx_max_seq_len"] gen_max_seq_len = extracted_config["gen_max_seq_len"] - assert ctx_max_seq_len > (isl + osl), "config error: ctx_max_seq_len <= (isl + osl)" + assert ctx_max_seq_len > isl, "config error: ctx_max_seq_len > isl" assert gen_max_seq_len > (isl + osl), "config error: gen_max_seq_len <= (isl + osl)" From 96d9b67d658a5212d4d05f03847e006c300d4c5c Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Mon, 8 Dec 2025 17:27:13 +0800 Subject: [PATCH 010/172] [https://nvbugs/5527655][test] Add test case for RCCA 5527655 (#9511) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/compare_backends.py | 4 +++ .../defs/perf/pytorch_model_config.py | 29 +++++++++++++++++++ .../test_lists/qa/llm_perf_core.yml | 2 +- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py index d6fb84e4cb..1812fd36d5 100644 --- a/tests/integration/defs/perf/disagg/compare_backends.py +++ b/tests/integration/defs/perf/disagg/compare_backends.py @@ -50,6 +50,10 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): sys.exit(0) # Read CSV file + if not os.path.exists(csv_path): + print(f"CSV file not found: {csv_path}") + sys.exit(0) + df = pd.read_csv(csv_path) if len(df) == 0: diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index c59b68dcc1..8a0678e16f 100644 --- 
a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -79,6 +79,35 @@ def get_model_yaml_config(model_label: str, } } }, + { + 'patterns': [ + 'deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:32-maxnt:32768-input_output_len:8192,1024-reqs:20-con:1-ep:1-gpus:4' + ], + 'config': { + 'enable_iter_perf_stats': True, + 'print_iter_log': False, + 'cuda_graph_config': { + 'max_batch_size': 16, + 'enable_padding': False + }, + 'moe_config': { + 'backend': 'TRTLLM', + 'max_num_tokens': 32768 + }, + 'speculative_config': { + 'decoding_type': 'MTP', + 'num_nextn_predict_layers': 3 + }, + 'disable_overlap_scheduler': True, + 'enable_autotuner': True, + 'kv_cache_config': { + 'free_gpu_memory_fraction': 0.6, + 'enable_block_reuse': True, + 'enable_partial_reuse': False + }, + 'enable_chunked_prefill': True + } + }, # DeepSeek R1 models with large batch sizes and cuda graph padding { 'patterns': [ diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index 5c1b3ba0ff..b059ee3f8c 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -285,7 +285,7 @@ llm_perf_core: - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) - + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:32-maxnt:32768-input_output_len:8192,1024-reqs:20-con:1-ep:1-tp:4-gpus:4] TIMEOUT(120) # 12: H100, H20, H200, B200, B300 test cases - condition: From 
52f78e4000cf2d00b321d12b324e4e3e4e4dac31 Mon Sep 17 00:00:00 2001 From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> Date: Mon, 8 Dec 2025 19:26:01 +0800 Subject: [PATCH 011/172] [http://nvbugs/5649010][fix] fix test_auto_scaling.py::test_worker_restart timeout (#9775) Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> --- .../defs/disaggregated/test_auto_scaling.py | 21 +++++++++---------- tests/integration/test_lists/waives.txt | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_auto_scaling.py b/tests/integration/defs/disaggregated/test_auto_scaling.py index d3189e2eec..b96a6b3615 100644 --- a/tests/integration/defs/disaggregated/test_auto_scaling.py +++ b/tests/integration/defs/disaggregated/test_auto_scaling.py @@ -262,6 +262,7 @@ def terminate(*args, show_log_lines=30, release_port=True): print(f"Failed to tail {arg.log_path}: {e}") print(f"Traceback: {traceback.format_exc()}") if arg.process: + print(f"Killing process {arg.process.pid}") try: arg.process.kill() arg.process.wait(timeout=10) @@ -274,6 +275,8 @@ def terminate(*args, show_log_lines=30, release_port=True): USED_PORTS.discard(arg.port) except Exception: print(f"Failed to terminate process {arg.process.pid}") + else: + print(f"Process is None on port {arg.port}") def request_completion(model_name, prompt, port): @@ -396,7 +399,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config, port=disagg_port) print(response) # kill gen1, the request should fail - terminate(gen_worker1, release_port=False) + terminate(gen_worker1) await asyncio.sleep(CHECK_STATUS_INTERVAL) verify_cluster_info(False, 1, 0, port=disagg_port) with pytest.raises(Exception): @@ -422,7 +425,7 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config, assert len(response.choices[0].text) >= 1 # kill ctx1, the request should fail - terminate(ctx_worker1, release_port=False) + 
terminate(ctx_worker1) await asyncio.sleep(CHECK_STATUS_INTERVAL) verify_cluster_info(False, 0, 1, port=disagg_port) with pytest.raises(Exception): @@ -441,15 +444,11 @@ async def test_worker_restart(model_name, disagg_server_config, worker_config, response_text = response.choices[0].text assert len(response.choices[0].text) >= 1 - # restart ctx1 and gen1 with the same ports, we have 2 ctxs and 2 gens now - ctx_worker1 = run_ctx_worker(model_name, - worker_config, - work_dir, - port=ctx_worker1.port) - gen_worker1 = run_gen_worker(model_name, - worker_config, - work_dir, - port=gen_worker1.port) + # start ctx1 and gen1 again, we have 2 ctxs and 2 gens now + # Note: Do NOT start them with the same ports as the previous ones, the ports may be not released immediately after terminate, + # causing a port conflict and test timeout. + ctx_worker1 = run_ctx_worker(model_name, worker_config, work_dir) + gen_worker1 = run_gen_worker(model_name, worker_config, work_dir) await wait_for_worker_ready(ctx_worker1.port) await wait_for_worker_ready(gen_worker1.port) await asyncio.sleep(CHECK_STATUS_INTERVAL) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index c90ff83e2a..28714e45c0 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -437,7 +437,6 @@ unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_ unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP 
(https://nvbugs/5722629) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5649010) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653) From 98db262a6715cca94320f768d6a41a5e5e2f1a5e Mon Sep 17 00:00:00 2001 From: Eran Geva <19514940+MrGeva@users.noreply.github.com> Date: Mon, 8 Dec 2025 13:26:21 +0200 Subject: [PATCH 012/172] [None][fix] Switch AutoDeploy's default allreduce strategy to NCCL (#9666) Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com> --- examples/auto_deploy/nano_v3.yaml | 1 + tensorrt_llm/_torch/auto_deploy/config/default.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/auto_deploy/nano_v3.yaml b/examples/auto_deploy/nano_v3.yaml index a87e262425..a847d9a8d4 100644 --- a/examples/auto_deploy/nano_v3.yaml +++ b/examples/auto_deploy/nano_v3.yaml @@ -14,6 +14,7 @@ kv_cache_config: transforms: detect_sharding: sharding_dims: ['ep', 'bmm'] + allreduce_strategy: 'AUTO' manual_config: head_dim: 128 tp_plan: diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml index a7251de20a..4edf3de150 100644 --- a/tensorrt_llm/_torch/auto_deploy/config/default.yaml +++ b/tensorrt_llm/_torch/auto_deploy/config/default.yaml @@ -81,7 +81,7 @@ transforms: sharding_source: ['manual', 'factory', 'heuristic'] support_partial_config: true sharding_dims: ['tp', 'ep', 'bmm'] - allreduce_strategy: 'AUTO' + allreduce_strategy: 'NCCL' dist_backend: auto requires_shape_prop: true sharding_transform_executor: From 1c7b7cdd475502af7bf040ffc8914192bc1152d8 Mon Sep 17 
00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 8 Dec 2025 23:12:32 +0800 Subject: [PATCH 013/172] [TRTLLM-9506][fix] Fix AR for DeepSeek-R1 2 model path (#9661) Signed-off-by: qgai --- tensorrt_llm/_torch/models/modeling_deepseekv3.py | 13 +++++++++---- tensorrt_llm/_torch/models/modeling_speculative.py | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 4963471ed8..40fbaa983d 100755 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -1233,13 +1233,13 @@ class DeepseekV3DecoderLayer(DecoderLayer): hidden_states, residual = self.moe_allreduce( fc2_output, all_reduce_params=moe_all_reduce_params) else: - if spec_metadata is not None and spec_metadata.is_layer_capture( - self.layer_idx): - spec_metadata.maybe_capture_hidden_states( - self.layer_idx, hidden_states, residual) if self.next_layer_layernorm is not None: hidden_states, residual = self.next_layer_layernorm( hidden_states, residual) + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + spec_metadata.maybe_capture_hidden_states( + self.layer_idx, hidden_states, None) return hidden_states, residual @@ -1357,6 +1357,7 @@ class DeepseekV3MTP(DeepseekV3DecoderLayer): embed_tokens: Embedding, attn_metadata: AttentionMetadata, all_rank_num_tokens: Optional[List[int]] = None, + spec_metadata: Optional[SpecMetadata] = None, **kwargs, ) -> torch.Tensor: @@ -1433,6 +1434,10 @@ class DeepseekV3MTP(DeepseekV3DecoderLayer): else: hidden_states, _ = self.shared_head.norm(hidden_states, residual) + # It's for 2-model path, capture the hidden states + if spec_metadata is not None: + spec_metadata.maybe_capture_hidden_states(0, hidden_states, None) + return hidden_states diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py 
b/tensorrt_llm/_torch/models/modeling_speculative.py index 8991768ad3..a94e288172 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -455,6 +455,7 @@ class MTPDraftModel(nn.Module): hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, all_rank_num_tokens: Optional[List[int]] = None, + spec_metadata: Optional[SpecMetadata] = None, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: hidden_states = self.layers( @@ -464,6 +465,7 @@ class MTPDraftModel(nn.Module): embed_tokens=self.embed_tokens, attn_metadata=attn_metadata, all_rank_num_tokens=all_rank_num_tokens, + spec_metadata=spec_metadata, ) return hidden_states @@ -518,6 +520,7 @@ class MTPDraftModelForCausalLM(DecoderModelForCausalLM[MTPDraftModel, hidden_states=hidden_states, attn_metadata=attn_metadata, all_rank_num_tokens=attn_metadata.all_rank_num_tokens, + spec_metadata=spec_metadata, **kwargs) return self.logits_processor.forward( output, From f6df9eb2a6d75d1c0bf5cae9eb816463408c6868 Mon Sep 17 00:00:00 2001 From: Frank <3429989+FrankD412@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:37:40 -0800 Subject: [PATCH 014/172] [TRTLLM-9089][chore] Port prepare_dataset into trtllm-bench (#9250) --- .gitignore | 3 + benchmarks/cpp/prepare_dataset.py | 2 +- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 20 +- docs/source/developer-guide/perf-analysis.md | 10 +- .../developer-guide/perf-benchmarking.md | 23 +- .../legacy/performance/perf-analysis.md | 8 +- .../legacy/performance/perf-benchmarking.md | 4 +- examples/llm-api/llm_mgmn_trtllm_bench.sh | 9 +- .../llm-api/out_of_tree_example/readme.md | 12 +- examples/models/core/deepseek_v3/README.md | 24 +- examples/models/core/qwen/README.md | 11 +- tensorrt_llm/bench/dataset/__init__.py | 0 tensorrt_llm/bench/dataset/prepare_dataset.py | 93 ++++++ .../bench/dataset/prepare_real_data.py | 305 ++++++++++++++++++ .../bench/dataset/prepare_synthetic_data.py | 104 ++++++ 
tensorrt_llm/bench/dataset/utils.py | 96 ++++++ tensorrt_llm/commands/bench.py | 2 + .../defs/perf/README_release_test.md | 20 +- tests/integration/defs/perf/test_perf.py | 18 +- tests/integration/defs/perf/utils.py | 4 +- tests/integration/defs/test_e2e.py | 62 ++-- .../unit/singlegpu/test_ad_trtllm_bench.py | 14 +- tests/unittest/tools/test_prepare_dataset.py | 36 ++- 23 files changed, 744 insertions(+), 136 deletions(-) create mode 100644 tensorrt_llm/bench/dataset/__init__.py create mode 100644 tensorrt_llm/bench/dataset/prepare_dataset.py create mode 100644 tensorrt_llm/bench/dataset/prepare_real_data.py create mode 100644 tensorrt_llm/bench/dataset/prepare_synthetic_data.py create mode 100644 tensorrt_llm/bench/dataset/utils.py diff --git a/.gitignore b/.gitignore index 78d8da20e4..ccecb77e98 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,6 @@ compile_commands.json # Enroot sqsh files enroot/sw-tensorrt-docker+*.sqsh enroot/tensorrt_llm.devel.sqsh + +# MacOSX Files +.DS_Store diff --git a/benchmarks/cpp/prepare_dataset.py b/benchmarks/cpp/prepare_dataset.py index 2f7b5516b6..3b9665fd29 100644 --- a/benchmarks/cpp/prepare_dataset.py +++ b/benchmarks/cpp/prepare_dataset.py @@ -49,7 +49,7 @@ class RootArgs(BaseModel): return self -@click.group() +@click.group(deprecated=True) @click.option( "--tokenizer", required=True, diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index bbb276a6e9..ad0e9975a1 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -251,13 +251,13 @@ To do the benchmark, run the following command: ```bash # generate synthetic dataset -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \ - --stdout \ - --tokenizer nvidia/DeepSeek-R1-FP4 \ +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \ + prepare-dataset \ + --output 
dataset.txt \ token-norm-dist \ --input-mean 1024 --output-mean 2048 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 49152 > dataset.txt + --num-requests 49152 YOUR_DATA_PATH=./dataset.txt @@ -353,13 +353,14 @@ To do the benchmark, run the following command: ```bash # generate synthetic dataset -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \ - --stdout \ - --tokenizer deepseek-ai/DeepSeek-R1 \ +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \ + prepare-dataset \ + --output dataset.txt \ token-norm-dist \ --input-mean 1024 --output-mean 2048 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 5120 > dataset.txt + --num-requests 5120 + YOUR_DATA_PATH=./dataset.txt cat >./extra-llm-api-config.yml< /tmp/dataset.txt +trtllm-bench --model ${MODEL_PATH} \ + prepare-dataset \ + --output dataset.txt \ + token-norm-dist \ + --num-requests=${NUM_SAMPLES} \ + --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 # Benchmark and profile TLLM_PROFILE_START_STOP=100-150 nsys profile \ diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 57ef00d8f6..63bd9f6f8f 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -152,7 +152,7 @@ directory. 
For example, to generate a synthetic dataset of 1000 requests with a 128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run: ```shell -python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt +trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 ``` ### Running with the PyTorch Workflow @@ -233,13 +233,13 @@ The PyTorch workflow supports benchmarking with LoRA (Low-Rank Adaptation) adapt **Preparing LoRA Dataset** -Use `prepare_dataset.py` with LoRA-specific options to generate requests with LoRA metadata: +Use `trtllm-bench prepare-dataset` with LoRA-specific options to generate requests with LoRA metadata: ```shell -python3 benchmarks/cpp/prepare_dataset.py \ - --stdout \ +trtllm-bench \ + --model /path/to/tokenizer \ + prepare-dataset \ --rand-task-id 0 1 \ - --tokenizer /path/to/tokenizer \ --lora-dir /path/to/loras \ token-norm-dist \ --num-requests 100 \ @@ -310,17 +310,18 @@ Each subdirectory should contain the LoRA adapter files for that specific task. To benchmark multi-modal models with PyTorch workflow, you can follow the similar approach as above. First, prepare the dataset: -```python -python ./benchmarks/cpp/prepare_dataset.py \ - --tokenizer Qwen/Qwen2-VL-2B-Instruct \ - --stdout \ - dataset \ +```bash +trtllm-bench \ + --model Qwen/Qwen2-VL-2B-Instruct \ + prepare-dataset \ + --output mm_data.jsonl + real-dataset --dataset-name lmms-lab/MMMU \ --dataset-split test \ --dataset-image-key image \ --dataset-prompt-key question \ --num-requests 10 \ - --output-len-dist 128,5 > mm_data.jsonl + --output-len-dist 128,5 ``` It will download the media files to `/tmp` directory and prepare the dataset with their paths. 
Note that the `prompt` fields are texts and not tokenized ids. This is due to the fact that the `prompt` and the media (image/video) are processed by a preprocessor for multimodal files. diff --git a/docs/source/legacy/performance/perf-analysis.md b/docs/source/legacy/performance/perf-analysis.md index f72437f4e9..51abd6460d 100644 --- a/docs/source/legacy/performance/perf-analysis.md +++ b/docs/source/legacy/performance/perf-analysis.md @@ -66,10 +66,10 @@ Say we want to profile iterations 100 to 150 on a trtllm-bench/trtllm-serve run, #!/bin/bash # Prepare dataset for the benchmark -python3 benchmarks/cpp/prepare_dataset.py \ - --tokenizer=${MODEL_PATH} \ - --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \ - --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt +trtllm-bench \ + --model=${MODEL_PATH} prepare-dataset \ + --output /tmp/dataset.txt token-norm-dist --num-requests=${NUM_SAMPLES} \ + --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 # Benchmark and profile TLLM_PROFILE_START_STOP=100-150 nsys profile \ diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 5efd6625f0..9530b6da1b 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -110,7 +110,7 @@ of 128:128. 
To run the benchmark from start to finish, run the following commands: ```shell -python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt +trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 trtllm-bench --model meta-llama/Llama-3.1-8B build --dataset /tmp/synthetic_128_128.txt --quantization FP8 trtllm-bench --model meta-llama/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-3.1-8B/tp_1_pp_1 ``` @@ -207,7 +207,7 @@ directory. For example, to generate a synthetic dataset of 1000 requests with a 128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run: ```shell -benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt +trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 ``` ### Building a Benchmark Engine diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh index 4bd7b1d8c8..f8167966a8 100644 --- a/examples/llm-api/llm_mgmn_trtllm_bench.sh +++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh @@ -71,7 +71,6 @@ # not supported in Slurm mode, you need to download the model and put it in # the LOCAL_MODEL directory. -export prepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py" export data_path="$WORKDIR/token-norm-dist.txt" echo "Preparing dataset..." 
@@ -86,14 +85,14 @@ srun -l \ --mpi=pmix \ bash -c " $PROLOGUE - python3 $prepare_dataset \ - --tokenizer=$LOCAL_MODEL \ - --stdout token-norm-dist \ + trtllm-bench --model=$LOCAL_MODEL prepare-dataset \ + --output $data_path \ + token-norm-dist \ --num-requests=100 \ --input-mean=128 \ --output-mean=128 \ --input-stdev=0 \ - --output-stdev=0 > $data_path + --output-stdev=0 " echo "Running benchmark..." diff --git a/examples/llm-api/out_of_tree_example/readme.md b/examples/llm-api/out_of_tree_example/readme.md index d93981bb41..f506ae7cf5 100644 --- a/examples/llm-api/out_of_tree_example/readme.md +++ b/examples/llm-api/out_of_tree_example/readme.md @@ -42,7 +42,17 @@ Similar to the quickstart example, you can use the same CLI argument with `trtll Prepare the dataset: ``` -python ./benchmarks/cpp/prepare_dataset.py --tokenizer ./model_ckpt --stdout dataset --dataset-name lmms-lab/MMMU --dataset-split test --dataset-image-key image --dataset-prompt-key "question" --num-requests 100 --output-len-dist 128,5 > mm_data.jsonl +trtllm-bench \ + --model ./model_ckpt \ + prepare-dataset \ + --output mm_data.jsonl + real-dataset + --dataset-name lmms-lab/MMMU \ + --dataset-split test \ + --dataset-image-key image \ + --dataset-prompt-key question \ + --num-requests 10 \ + --output-len-dist 128,5 ``` Run the benchmark: diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 934db2e493..1bb67546f9 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -140,12 +140,13 @@ To avoid OOM (out of memory) error, you need to adjust the values of "--max_batc #### ISL-64k-OSL-1024 ```bash DS_R1_NVFP4_MODEL_PATH=/path/to/DeepSeek-R1 -python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ - --stdout --tokenizer ${DS_R1_NVFP4_MODEL_PATH} \ +trtllm-bench --model ${DS_R1_NVFP4_MODEL_PATH} \ + prepare-dataset \ + --output /tmp/benchmarking_64k.txt \ token-norm-dist \ --input-mean 
65536 --output-mean 1024 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 24 > /tmp/benchmarking_64k.txt + --num-requests 24 cat < /tmp/extra-llm-api-config.yml cuda_graph_config: @@ -166,12 +167,13 @@ trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} t #### ISL-128k-OSL-1024 ```bash DS_R1_NVFP4_MODEL_PATH=/path/to/DeepSeek-R1 -python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ - --stdout --tokenizer ${DS_R1_NVFP4_MODEL_PATH} \ +trtllm-bench --model ${DS_R1_NVFP4_MODEL_PATH} \ + prepare-dataset \ + --output /tmp/benchmarking_128k.txt \ token-norm-dist \ --input-mean 131072 --output-mean 1024 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 4 > /tmp/benchmarking_128k.txt + --num-requests 4 cat < /tmp/extra-llm-api-config.yml cuda_graph_config: @@ -356,7 +358,7 @@ curl http://localhost:8000/v1/completions \ }' ``` -For DeepSeek-R1 FP4, use the model name `nvidia/DeepSeek-R1-FP4-v2`. +For DeepSeek-R1 FP4, use the model name `nvidia/DeepSeek-R1-FP4-v2`. For DeepSeek-V3, use the model name `deepseek-ai/DeepSeek-V3`. ### Disaggregated Serving @@ -610,10 +612,10 @@ trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path / Step 1: Prepare dataset and `extra-llm-api-config.yml`. 
```bash -python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \ - --tokenizer=/path/to/DeepSeek-R1 \ - --stdout token-norm-dist --num-requests=49152 \ - --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt +trtllm-bench --model /path/to/DeepSeek-R1 \ + prepare-dataset --output /tmp/dataset.txt \ + token-norm-dist --num-requests=49152 \ + --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml < ${path_data} +trtllm-bench\ + --model=${folder_model} \ + prepare-dataset --output ${path_data} \ + token-norm-dist --num-requests=$(( concurrency * 5 )) \ + --input-mean=${min_input_len} --output-mean=${min_output_len} --input-stdev=0 --output-stdev=0 ``` ### Serving diff --git a/tensorrt_llm/bench/dataset/__init__.py b/tensorrt_llm/bench/dataset/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorrt_llm/bench/dataset/prepare_dataset.py b/tensorrt_llm/bench/dataset/prepare_dataset.py new file mode 100644 index 0000000000..aa7f4eb722 --- /dev/null +++ b/tensorrt_llm/bench/dataset/prepare_dataset.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pathlib import Path +from typing import Optional, Tuple + +import click +from pydantic import BaseModel, model_validator +from transformers import AutoTokenizer + +from tensorrt_llm.bench.dataset.prepare_real_data import real_dataset +from tensorrt_llm.bench.dataset.prepare_synthetic_data import token_norm_dist, token_unif_dist + + +class RootArgs(BaseModel): + tokenizer: str + output: str + random_seed: int + task_id: int + trust_remote_code: bool = False + rand_task_id: Optional[Tuple[int, int]] + lora_dir: Optional[str] = None + + @model_validator(mode="after") + def validate_tokenizer(self): + try: + tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer, padding_side="left", trust_remote_code=self.trust_remote_code + ) + except EnvironmentError as e: + raise ValueError( + "Cannot find a tokenizer from the given string because of " + f"{e}\nPlease set tokenizer to the directory that contains " + "the tokenizer, or set to a model name in HuggingFace." + ) + tokenizer.pad_token = tokenizer.eos_token + self.tokenizer = tokenizer + + return self + + +@click.group(name="prepare-dataset") +@click.option( + "--output", type=str, help="Output json filename.", default="preprocessed_dataset.json" +) +@click.option( + "--random-seed", required=False, type=int, help="random seed for token_ids", default=420 +) +@click.option("--task-id", type=int, default=-1, help="LoRA task id") +@click.option("--rand-task-id", type=int, default=None, nargs=2, help="Random LoRA Tasks") +@click.option("--lora-dir", type=str, default=None, help="Directory containing LoRA adapters") +@click.option( + "--log-level", default="info", type=click.Choice(["info", "debug"]), help="Logging level." 
+) +@click.option( + "--trust-remote-code", + is_flag=True, + default=False, + envvar="TRUST_REMOTE_CODE", + help="Trust remote code.", +) +@click.pass_context +def prepare_dataset(ctx, **kwargs): + """Prepare dataset for benchmarking with trtllm-bench.""" + model = ctx.obj.model or ctx.obj.checkpoint_path + output_path = Path(kwargs["output"]) + output_path.parent.mkdir(parents=True, exist_ok=True) + + ctx.obj = RootArgs( + tokenizer=model, + output=kwargs["output"], + random_seed=kwargs["random_seed"], + task_id=kwargs["task_id"], + rand_task_id=kwargs["rand_task_id"], + lora_dir=kwargs["lora_dir"], + trust_remote_code=kwargs["trust_remote_code"], + ) + + +prepare_dataset.add_command(real_dataset) +prepare_dataset.add_command(token_norm_dist) +prepare_dataset.add_command(token_unif_dist) diff --git a/tensorrt_llm/bench/dataset/prepare_real_data.py b/tensorrt_llm/bench/dataset/prepare_real_data.py new file mode 100644 index 0000000000..063650c926 --- /dev/null +++ b/tensorrt_llm/bench/dataset/prepare_real_data.py @@ -0,0 +1,305 @@ +import logging +import random +import re +import tempfile +from functools import partial +from typing import Optional + +import click +from datasets import load_dataset +from PIL import Image +from pydantic import BaseModel, model_validator + +from tensorrt_llm.bench.dataset.utils import ( + generate_multimodal_dataset, + generate_text_dataset, + get_norm_dist_lengths, + write_dataset_to_file, +) + + +def validate_output_len_dist(ctx, param, value): + """Validate the --output-len-dist option.""" + if value is None: + return value + m = re.match(r"(\d+),(\d+)", value) + if m: + return int(m.group(1)), int(m.group(2)) + else: + raise AssertionError( + "Incorrect specification for --output-len-dist. 
Correct format: " + "--output-len-dist ," + ) + + +class DatasetConfig(BaseModel): + """Dataset configurations.""" + + """Name of the dataset on HuggingFace.""" + name: str + """Config name of the dataset if existing.""" + config_name: Optional[str] = None + """Split of the dataset. Typical values: train, validation, test. Setting to None will include all splits.""" + split: Optional[str] + """The dataset dictionary used for the input sentence.""" + input_key: Optional[str] = None + """The dataset dictionary key used for the prompt of the input sentence. Must not be set when prompt is set.""" + image_key: Optional[str] = None + """The dataset dictionary key used for the images.""" + prompt_key: Optional[str] = None + """The prompt sentence to be added to the input sentence. Must not be set when prompt_key is set.""" + prompt: Optional[str] = None + """The dataset dictionary key used to derive the output sequence length. Set to None if no output key.""" + output_key: Optional[str] + + @model_validator(mode="after") + def check_prompt(self) -> "DatasetConfig": + if self.prompt_key and self.prompt: + raise AssertionError("--prompt-key and --prompt cannot be set at the same time.") + if (not self.prompt_key) and (not self.prompt): + raise AssertionError("Either --prompt-key or --prompt must be set.") + return self + + @property + def query(self): + """Generate the query for HuggingFace `datasets.load_dataset()`.""" + if self.config_name: + return [self.name, self.config_name] + else: + return [self.name] + + def get_prompt(self, req): + """Get the prompt sentence from the given request.""" + if self.prompt_key: + assert self.prompt_key in req, ( + f"Dataset {self.name} does not have key '{self.prompt_key}'. 
" + "Please set --prompt-key to one of the available keys: " + f"{req.keys()}" + ) + return req[self.prompt_key] + else: + return self.prompt + + def get_input(self, req): + """Get the input sentence from the given request.""" + assert self.input_key in req, ( + f"Dataset {self.name} does not have key '{self.input_key}'. " + "Please set --input-key to one of the available keys: " + f"{req.keys()}" + ) + return req[self.input_key] + + def get_images(self, req): + """Get the images from the given request.""" + image_keys = [self.image_key] + [f"{self.image_key}_{i}" for i in range(1, 8)] + assert any(key in req for key in image_keys), ( + f"Dataset {self.name} does not have key '{self.image_key}'. " + "Please set --dataset-image-key to one of the available keys: " + f"{req.keys()}" + ) + images = [] + for key in image_keys: + if key in req and req[key] is not None: + images.append(req[key]) + return images + + def get_output(self, req): + """Get the output sentence from the given request.""" + if self.output_key is None: + raise RuntimeError( + "--output-key is not set. Please either:\n" + "1. Define output length through --output-len-dist.\n" + f"2. If the dataset {self.name} has key for golden output and " + "you wish to set output length to the length of the golden " + "output, set --output-key." + ) + assert self.output_key in req, ( + f"Dataset {self.name} does not have key '{self.output_key}'. " + "Please set --output-key to one of the available keys: " + f"{req.keys()}" + ) + return req[self.output_key] + + +def load_dataset_from_hf(dataset_config: DatasetConfig): + """Load dataset from HuggingFace. + + Args: + dataset_config: A `DatasetConfig` object that defines the dataset to load. + + Returns: + Dataset iterator. + + Raises: + ValueError: When dataset loading fails due to incorrect dataset config setting. 
+ """ + try: + dataset = iter( + load_dataset( + *dataset_config.query, + split=dataset_config.split, + streaming=True, + trust_remote_code=True, + ) + ) + except ValueError as e: + if "Config" in e: + e += "\n Please add the config name to the dataset config yaml." + elif "split" in e: + e += "\n Please specify supported split in the dataset config yaml." + raise ValueError(e) + + return dataset + + +@click.command(name="real-dataset") +@click.option("--dataset-name", required=True, type=str, help="Dataset name in HuggingFace.") +@click.option( + "--dataset-config-name", + type=str, + default=None, + help="Dataset config name in HuggingFace (if exists).", +) +@click.option("--dataset-split", type=str, required=True, help="Split of the dataset to use.") +@click.option("--dataset-input-key", type=str, help="The dataset dictionary key for input.") +@click.option( + "--dataset-image-key", type=str, default="image", help="The dataset dictionary key for images." +) +@click.option( + "--dataset-prompt-key", + type=str, + default=None, + help="The dataset dictionary key for prompt (if exists).", +) +@click.option( + "--dataset-prompt", + type=str, + default=None, + help="The prompt string when there is no prompt key for the dataset.", +) +@click.option( + "--dataset-output-key", + type=str, + default=None, + help="The dataset dictionary key for output (if exists).", +) +@click.option( + "--num-requests", + type=int, + default=None, + help="Number of requests to be generated. Will be capped to min(dataset.num_rows, num_requests).", +) +@click.option( + "--max-input-len", + type=int, + default=None, + help="Maximum input sequence length for a given request. This will be used to filter out the " + "requests with long input sequence length. Default will include all the requests.", +) +@click.option( + "--output-len-dist", + type=str, + default=None, + callback=validate_output_len_dist, + help="Output length distribution. 
Default will be the length of the golden output from " + "the dataset. Format: ,. E.g. 100,10 will randomize " + "the output length with mean=100 and variance=10.", +) +@click.pass_obj +def real_dataset(root_args, **kwargs): + """Prepare dataset from real dataset.""" + dataset_config = DatasetConfig( + **{k[8:]: v for k, v in kwargs.items() if k.startswith("dataset_")} + ) + + input_ids = [] + input_lens = [] + output_lens = [] + task_ids = [] + req_cnt = 0 + modality = None + multimodal_texts = [] + multimodal_image_paths = [] + for req in load_dataset_from_hf(dataset_config): + if any(key in req for key in ["image", "image_1", "video"]): + # multimodal input + if "video" in req and req["video"] is not None: + assert "Not supported yet" + assert kwargs["output_len_dist"] is not None, ( + "Output length distribution must be set for multimodal requests." + ) + modality = "image" + text = dataset_config.get_prompt(req) + images = dataset_config.get_images(req) + image_paths = [] + for image in images: + if image is not None: + if isinstance(image, str): + image_paths.append(image) + elif isinstance(image, Image.Image): + with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file: + logging.debug(f"Saving image to {tmp_file.name}") + image = image.convert("RGB") + image.save(tmp_file, "JPEG") + filepath = tmp_file.name + image_paths.append(filepath) + else: + raise ValueError(f"Invalid image path: {image}") + multimodal_texts.append(text) + multimodal_image_paths.append(image_paths) + else: + # text input + prompt = dataset_config.get_prompt(req) + " " + dataset_config.get_input(req) + logging.debug(f"Input sequence: {prompt}") + line = root_args.tokenizer.encode(prompt) + if kwargs["max_input_len"] and len(line) > kwargs["max_input_len"]: + continue + input_ids.append(line) + input_lens.append(len(line)) + + # output if fetch from golden + if kwargs["output_len_dist"] is None: + 
output_lens.append(len(root_args.tokenizer.encode(dataset_config.get_output(req)))) + + # lora task id + task_id = root_args.task_id + if root_args.rand_task_id is not None: + min_id, max_id = root_args.rand_task_id + task_id = random.randint(min_id, max_id) + task_ids.append(task_id) + + req_cnt += 1 + if kwargs["num_requests"] and req_cnt >= kwargs["num_requests"]: + break + + if ( + kwargs["num_requests"] + and (len(input_ids) if modality is None else len(multimodal_texts)) < kwargs["num_requests"] + ): + logging.warning( + f"Number of requests={len(input_ids) if modality is None else len(multimodal_texts)} is" + f" smaller than the num-requests user set={kwargs['num_requests']}." + ) + + # output if randomized + if kwargs["output_len_dist"] is not None: + osl_mean, osl_stdev = kwargs["output_len_dist"] + output_lens = get_norm_dist_lengths( + osl_mean, + osl_stdev, + len(input_ids) if modality is None else len(multimodal_texts), + root_args.random_seed, + ) + logging.debug(f"Input lengths: {[len(i) for i in input_ids]}") + logging.debug(f"Output lengths: {output_lens}") + if modality is not None: + logging.debug(f"Modality: {modality}") + + dataset_generator = None + if modality is not None: + dataset_generator = partial( + generate_multimodal_dataset, multimodal_texts, multimodal_image_paths + ) + else: + dataset_generator = partial(generate_text_dataset, input_ids) + write_dataset_to_file(dataset_generator(output_lens), root_args.output) diff --git a/tensorrt_llm/bench/dataset/prepare_synthetic_data.py b/tensorrt_llm/bench/dataset/prepare_synthetic_data.py new file mode 100644 index 0000000000..342aa51438 --- /dev/null +++ b/tensorrt_llm/bench/dataset/prepare_synthetic_data.py @@ -0,0 +1,104 @@ +import random +import warnings + +import click + +from tensorrt_llm.bench.dataset.utils import ( + gen_random_tokens, + generate_text_dataset, + get_norm_dist_lengths, + get_unif_dist_lengths, + write_dataset_to_file, +) + + +def 
_generate_task_ids_and_lora_config(root_args, num_reqs): + """Generate task IDs and determine LoRA configuration based on root_args.""" + if root_args.rand_task_id is None: + task_ids = [root_args.task_id for _ in range(num_reqs)] + else: + min_id, max_id = root_args.rand_task_id + task_ids = [random.randint(min_id, max_id) for _ in range(num_reqs)] + + use_task_ids = root_args.task_id != -1 or root_args.rand_task_id is not None + + # Determine if LoRA should be used (requires both task IDs and lora_dir) + use_lora = use_task_ids and root_args.lora_dir is not None + + # Warn if task IDs are specified but no LoRA directory is provided + if use_task_ids and not use_lora: + warnings.warn( + "Task IDs require LoRA directory. Use --lora-dir or omit task IDs.", UserWarning + ) + + return ( + task_ids, + task_ids if use_task_ids else None, + {"lora_dir": root_args.lora_dir} if use_lora else None, + ) + + +@click.command() +@click.option("--num-requests", required=True, type=int, help="Number of requests to be generated") +@click.option("--input-mean", required=True, type=int, help="normal dist mean for input tokens") +@click.option("--input-stdev", required=True, type=int, help="normal dist stdev for input tokens") +@click.option("--output-mean", required=True, type=int, help="normal dist mean for output tokens") +@click.option("--output-stdev", required=True, type=int, help="normal dist stdev for output tokens") +@click.pass_obj +def token_norm_dist(root_args, **kwargs): + """Prepare synthetic dataset by generating random tokens with normal dist lengths.""" + input_ids = [] + input_lens = [] + output_lens = [] + + input_lens = get_norm_dist_lengths( + kwargs["input_mean"], kwargs["input_stdev"], kwargs["num_requests"], root_args.random_seed + ) + + num_reqs = len(input_lens) + output_lens = get_norm_dist_lengths( + kwargs["output_mean"], kwargs["output_stdev"], num_reqs, root_args.random_seed + ) + input_ids = gen_random_tokens(input_lens, root_args.tokenizer, 
root_args.random_seed) + _, print_task_ids, lora_config = _generate_task_ids_and_lora_config(root_args, num_reqs) + dataset_generator = generate_text_dataset( + input_ids, output_lens, task_ids=print_task_ids, lora_config=lora_config + ) + write_dataset_to_file(dataset_generator, root_args.output) + + +@click.command() +@click.option("--num-requests", required=True, type=int, help="Number of requests to be generated") +@click.option( + "--input-min", required=True, type=int, help="uniform dist (inclusive) min for input tokens" +) +@click.option( + "--input-max", required=True, type=int, help="normal dist (inclusive) max for input tokens" +) +@click.option( + "--output-min", required=True, type=int, help="normal dist (inclusive) min for output tokens" +) +@click.option( + "--output-max", required=True, type=int, help="normal dist (inclusive) max for output tokens" +) +@click.pass_obj +def token_unif_dist(root_args, **kwargs): + """Prepare synthetic dataset by generating random tokens with normal uniformly lengths.""" + input_ids = [] + input_lens = [] + output_lens = [] + + input_lens = get_unif_dist_lengths( + kwargs["input_min"], kwargs["input_max"], kwargs["num_requests"], root_args.random_seed + ) + + num_reqs = len(input_lens) + output_lens = get_unif_dist_lengths( + kwargs["output_min"], kwargs["output_max"], num_reqs, root_args.random_seed + ) + input_ids = gen_random_tokens(input_lens, root_args.tokenizer, root_args.random_seed) + _, print_task_ids, lora_config = _generate_task_ids_and_lora_config(root_args, num_reqs) + dataset_generator = generate_text_dataset( + input_ids, output_lens, task_ids=print_task_ids, lora_config=lora_config + ) + write_dataset_to_file(dataset_generator, root_args.output) diff --git a/tensorrt_llm/bench/dataset/utils.py b/tensorrt_llm/bench/dataset/utils.py new file mode 100644 index 0000000000..15c9170195 --- /dev/null +++ b/tensorrt_llm/bench/dataset/utils.py @@ -0,0 +1,96 @@ +import json +import math +import os +import random 
+from pathlib import Path + +import numpy as np + + +def generate_text_dataset(input_ids, output_lens, task_ids=None, lora_config=None): + for i, input_tokens in enumerate(input_ids): + d = {"task_id": i, "input_ids": input_tokens, "output_tokens": output_lens[i]} + + # Add LoRA request if task_ids indicate LoRA usage + if task_ids is not None and lora_config is not None: + task_id = task_ids[i] + if task_id != -1: # -1 means no LoRA + d["lora_request"] = { + "lora_name": f"lora_{task_id}", + "lora_int_id": task_id, + "lora_path": os.path.join(lora_config.get("lora_dir", "loras"), str(task_id)), + } + + yield json.dumps(d, separators=(",", ":"), ensure_ascii=False) + + +def generate_multimodal_dataset(multimodal_texts, multimodal_image_paths, output_lens): + for i, (text, image_paths) in enumerate(zip(multimodal_texts, multimodal_image_paths)): + d = { + "task_id": i, + "prompt": text, + "media_paths": image_paths, + "output_tokens": output_lens[i], + } + yield json.dumps(d, separators=(",", ":"), ensure_ascii=False) + + +def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs, random_seed): + if delay_dist == "constant": + return [mean_time_bet_reqs] * num_reqs + if delay_dist == "exponential_dist": + return get_exponential_dist_delays(mean_time_bet_reqs, num_reqs, random_seed) + # Reject unknown distributions explicitly rather than failing with UnboundLocalError. + raise ValueError(f"Unsupported delay_dist: {delay_dist!r}") + + +def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs, random_seed): + # set seed for determinism + np.random.seed(random_seed) + return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist() + + +def get_norm_dist_lengths(mean, stdev, num_reqs, random_seed): + # set seed for determinism + np.random.seed(random_seed) + numbers_list = np.random.normal(loc=mean, scale=stdev, size=num_reqs).tolist() + return [max(1, math.ceil(x)) for x in numbers_list] + + +def get_unif_dist_lengths(min_len, max_len, num_reqs, random_seed): + # set seed for determinism + rng = np.random.default_rng(random_seed) + numbers = rng.integers(low=min_len, high=max_len 
+ 1, size=num_reqs) + return numbers.tolist() + + +def gen_random_tokens(ip_lens, tokenizer, random_seed): + def get_sample_from_population(population_range, sample_size): + # random.sample can not sample a value more than once. hence the check + if sample_size < len(population_range): + sample = random.sample(population_range, sample_size) + else: + sample = random.choices(population_range, k=sample_size) + + return sample + + input_ids = [] + random.seed(random_seed) + for ip_len in ip_lens: + start_ids = get_sample_from_population(range(0, tokenizer.vocab_size), ip_len) + # Make sure it does not contain EOS token + eos_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False) + while set(eos_id).issubset(start_ids): + tmp_id = (eos_id[0] + 1) % tokenizer.vocab_size + start_ids = [tmp_id if element == eos_id[0] else element for element in start_ids] + input_ids.append(start_ids) + + return input_ids + + +def write_dataset_to_file(dataset_generator, output_file): + output_file = Path(output_file) + os.makedirs(output_file.parent, exist_ok=True) + with open(output_file, "w", encoding="utf-8") as f: + for item in dataset_generator: + f.write(item + "\n") diff --git a/tensorrt_llm/commands/bench.py b/tensorrt_llm/commands/bench.py index 29e570f43d..ab4755082f 100644 --- a/tensorrt_llm/commands/bench.py +++ b/tensorrt_llm/commands/bench.py @@ -7,6 +7,7 @@ from tensorrt_llm.bench.benchmark.low_latency import latency_command from tensorrt_llm.bench.benchmark.throughput import throughput_command from tensorrt_llm.bench.build.build import build_command from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment +from tensorrt_llm.bench.dataset.prepare_dataset import prepare_dataset from tensorrt_llm.logger import logger, severity_map @@ -65,6 +66,7 @@ def main( main.add_command(build_command) main.add_command(throughput_command) main.add_command(latency_command) +main.add_command(prepare_dataset) if __name__ == "__main__": main() diff --git 
a/tests/integration/defs/perf/README_release_test.md b/tests/integration/defs/perf/README_release_test.md index 0fdf4eaa85..2cfbc5ed7e 100644 --- a/tests/integration/defs/perf/README_release_test.md +++ b/tests/integration/defs/perf/README_release_test.md @@ -24,27 +24,25 @@ For trtllm-bench, the test extracts the following key performance metrics from l #### Without LoRA ```python -prepare_data_script = os.path.join(self._llm_root, "benchmarks", "cpp", "prepare_dataset.py") data_cmd += [ - "python3", prepare_data_script, "--stdout", - f"--tokenizer={tokenizer_dir}", f"token-norm-dist", - f"--num-requests={self._config.num_reqs}", - f"--input-mean={input_len}", f"--output-mean={output_len}", - f"--input-stdev={istdev}", f"--output-stdev={ostdev}", - f" > {dataset_path}" + "trtllm-bench", f"--model={tokenizer_dir}", + "prepare-dataset", "--output", dataset_path, "token-norm-dist", + f"--num-requests={self._config.num_reqs}", + f"--input-mean={input_len}", f"--output-mean={output_len}", + f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] ``` #### With LoRA ```python -"python3", prepare_data_script, f"--stdout", +"trtllm-bench", f"--model={tokenizer_dir}", + "prepare-dataset", "--output", dataset_path, f"--rand-task-id 0 {nloras-1}", - f"--tokenizer={tokenizer_dir}", f"--lora-dir={lora_dir}", + f"--lora-dir={lora_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", - f"--input-stdev={istdev}", f"--output-stdev={ostdev}", - f" > {dataset_path}" + f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ``` ### 2.2 PyTorch Configuration Generation diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index e6322c3221..def4cf505b 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -1933,6 +1933,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): "llama-7b-hf") if not 
os.path.exists(engine_dir): os.makedirs(engine_dir, exist_ok=True) + if self._config.num_loras > 0: istdev = 16 ostdev = 24 @@ -1958,14 +1959,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): self.lora_dirs.append(f"{lora_dir}/{i}") data_cmd += [f"ln -sf {lora_path} {lora_dir}/{i}", ";"] data_cmd += [ - "python3", prepare_data_script, f"--stdout", - f"--rand-task-id 0 {nloras-1}", - f"--tokenizer={tokenizer_dir}", f"--lora-dir={lora_dir}", + "trtllm-bench", f"--model={tokenizer_dir}", + "prepare-dataset", "--output", f"{dataset_path}", + f"--rand-task-id 0 {nloras-1}", f"--lora-dir={lora_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", - f"--input-stdev={istdev}", f"--output-stdev={ostdev}", - f" > {dataset_path}" + f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] else: @@ -1978,12 +1978,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): dataset_path = os.path.join(engine_dir, "synthetic_data.json") if self._build_script == 'trtllm-bench': data_cmd += [ - "python3", prepare_data_script, "--stdout", - f"--tokenizer={tokenizer_dir}", f"token-norm-dist", + "trtllm-bench", f"--model={tokenizer_dir}", + "prepare-dataset", "--output", f"{dataset_path}", + "token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", - f"--input-stdev={istdev}", f"--output-stdev={ostdev}", - f" > {dataset_path}" + f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] else: data_cmd += [ diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index d3c38ddb2d..2dcb5bf74f 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -730,8 +730,8 @@ class AbstractPerfScriptTestClass(abc.ABC): self._gpu_clock_lock = gpu_clock_lock tmpDir = temp_wd(self.get_working_dir()) - is_prepare_dataset_cmd = 'prepare_dataset' in commands.get_cmd_str( - cmd_idx) + 
cmd_str = commands.get_cmd_str(cmd_idx) + is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str is_perf_sanity_test = "perf_sanity" in full_test_name diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 61e3f72880..183c7c1760 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -494,16 +494,15 @@ class BenchRunner: return self.run_bench() def prepare_dataset(self): - dataset_tool = Path(self.llm_root, "benchmarks", "cpp", - "prepare_dataset.py") - # Generate a small dataset to run a test. self.work_dir.mkdir(parents=True) command = [ - f"{dataset_tool.resolve()}", - "--stdout", - "--tokenizer", + "trtllm-bench", + "--model", f"{self.model_path}", + "prepare-dataset", + "--output", + f"{self.dataset_path}", "token-norm-dist", "--input-mean", "128", @@ -517,13 +516,7 @@ class BenchRunner: str(self.num_requests), ] print(f"Running command: {' '.join(command)}") - dataset_output = self.llm_venv.run_cmd( - command, - caller=check_output, - ) - # Grab the stdout and write it to a dataset file for passing to suite. - with open(self.dataset_path, "w") as dataset: - dataset.write(dataset_output) + check_output(" ".join(command), shell=True) def build_engine(self): if self.skip_engine_build: @@ -774,7 +767,6 @@ def trtllm_bench_prolog( stream_mode = "streaming" if streaming else "non-streaming" benchmark_name = f"trtllm-bench-sanity-{quant_name}-{stream_mode}" benchmark_name += "-pytorch-backend" if skip_engine_build else benchmark_name - dataset_tool = Path(llm_root, "benchmarks", "cpp", "prepare_dataset.py") work_dir = Path(tempfile.TemporaryDirectory().name ) if skip_engine_build else Path(engine_dir) @@ -783,29 +775,26 @@ def trtllm_bench_prolog( shutil.rmtree(work_dir, ignore_errors=True) # Generate a small dataset to run a test. 
work_dir.mkdir(parents=True) - dataset_output = llm_venv.run_cmd( - [ - f"{dataset_tool.resolve()}", - "--stdout", - "--tokenizer", - f"{model_path}", - "token-norm-dist", - "--input-mean", - "128", - "--output-mean", - "128", - "--input-stdev", - "0", - "--output-stdev", - "0", - "--num-requests", - "10", - ], - caller=check_output, - ) - # Grab the stdout and write it to a dataset file for passing to suite. - with open(dataset_path, "w") as dataset: - dataset.write(dataset_output) + dataset_cmd = [ + "trtllm-bench", + "--model", + f"{model_path}", + "prepare-dataset", + "--output", + f"{dataset_path}", + "token-norm-dist", + "--input-mean", + "128", + "--output-mean", + "128", + "--input-stdev", + "0", + "--output-stdev", + "0", + "--num-requests", + "10", + ] + check_output(" ".join(dataset_cmd), shell=True) if not skip_engine_build: build_cmd = \ diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py index a63eca22c9..4dbe980802 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py @@ -49,16 +49,16 @@ def run_benchmark( def prepare_dataset(root_dir: str, temp_dir: str, model_path_or_name: str): _DATASET_NAME = "synthetic_128_128.txt" dataset_path = Path(temp_dir, _DATASET_NAME) - dataset_tool = Path(root_dir, "benchmarks", "cpp", "prepare_dataset.py") script_dir = Path(root_dir, "benchmarks", "cpp") # Generate a small dataset to run a test - matching workload configuration command = [ - "python3", - f"{dataset_tool}", - "--stdout", - "--tokenizer", + "trtllm-bench", + "--model", model_path_or_name, + "prepare-dataset", + "--output", + f"{dataset_path}", "token-norm-dist", "--input-mean", "128", @@ -77,9 +77,7 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_path_or_name: str): ) if result.returncode != 0: raise RuntimeError(f"Failed to 
prepare dataset: {result.stderr}") - # Grab the stdout and write it to a dataset file for passing to suite. - with open(dataset_path, "w") as dataset: - dataset.write(result.stdout) + return dataset_path diff --git a/tests/unittest/tools/test_prepare_dataset.py b/tests/unittest/tools/test_prepare_dataset.py index df2c8e9d1b..948cde1e09 100644 --- a/tests/unittest/tools/test_prepare_dataset.py +++ b/tests/unittest/tools/test_prepare_dataset.py @@ -48,12 +48,12 @@ class TestPrepareDatasetLora: task_dir.mkdir(parents=True, exist_ok=True) yield str(lora_dir) - def _build_base_command(self, llm_root: Path) -> List[str]: + def _build_base_command(self, output_path: Path) -> List[str]: """ Build the base command for running prepare_dataset.py. Args: - llm_root: Path to the TensorRT LLM root directory + output_path: Path to the output dataset file Returns: List[str]: Base command components @@ -61,8 +61,7 @@ class TestPrepareDatasetLora: Raises: pytest.skip: If LLM_MODELS_ROOT is not available """ - script_path = llm_root / _PREPARE_DATASET_SCRIPT_PATH - cmd = ["python3", str(script_path)] + cmd = ["trtllm-bench"] # Add required tokenizer argument model_cache = llm_models_root() @@ -70,10 +69,10 @@ class TestPrepareDatasetLora: pytest.skip("LLM_MODELS_ROOT not available") tokenizer_dir = model_cache / _TOKENIZER_SUBPATH - cmd.extend(["--tokenizer", str(tokenizer_dir)]) + cmd.extend(["--model", str(tokenizer_dir)]) # Always add --stdout flag since we parse stdout output - cmd.extend(["--stdout"]) + cmd.extend(["prepare-dataset", "--output", f"{output_path}"]) return cmd @@ -109,7 +108,7 @@ class TestPrepareDatasetLora: str(_DEFAULT_OUTPUT_STDEV) ]) - def _run_prepare_dataset(self, llm_root: Path, **kwargs) -> str: + def _run_prepare_dataset(self, **kwargs) -> str: """ Execute prepare_dataset.py with specified parameters and capture output. 
@@ -124,13 +123,20 @@ class TestPrepareDatasetLora: Raises: subprocess.CalledProcessError: If the command execution fails """ - cmd = self._build_base_command(llm_root) - self._add_lora_arguments(cmd, **kwargs) - self._add_synthetic_data_arguments(cmd) + with tempfile.TemporaryDirectory() as temp_dir: + output_path = Path(temp_dir) / "dataset.jsonl" + cmd = self._build_base_command(output_path) + self._add_lora_arguments(cmd, **kwargs) + self._add_synthetic_data_arguments(cmd) - # Execute command and capture output - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - return result.stdout + # Execute command and capture output + subprocess.run(cmd, check=True, cwd=temp_dir) + + data = "" + with open(output_path, "r") as f: + data = f.read() + + return data def _parse_json_output(self, output: str) -> List[Dict[str, Any]]: """ @@ -198,7 +204,7 @@ class TestPrepareDatasetLora: }, id="random_task_id") ]) - def test_lora_metadata_generation(self, llm_root: Path, temp_lora_dir: str, + def test_lora_metadata_generation(self, temp_lora_dir: str, test_params: Dict) -> None: """Test LoRA metadata generation with various configurations.""" # Extract test parameters @@ -213,7 +219,7 @@ class TestPrepareDatasetLora: if rand_task_id is not None: kwargs["rand_task_id"] = rand_task_id - output = self._run_prepare_dataset(llm_root, **kwargs) + output = self._run_prepare_dataset(**kwargs) json_data = self._parse_json_output(output) assert len(json_data) > 0, f"No JSON data generated for {description}" From 0a09465089109aaf667bc16c3543eff6a85b8aaf Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Mon, 8 Dec 2025 11:16:05 -0800 Subject: [PATCH 015/172] [https://nvbugs/5567586][feat] Ampere xqa swa specdec for GPT-OSS Eagle3-one-model (#8383) Signed-off-by: Jhao-Ting Chen --- cpp/kernels/xqa/mha.cu | 76 +++++++++++++--- jenkins/L0_Test.groovy | 1 + .../_torch/attention_backend/trtllm.py | 11 ++- .../defs/accuracy/test_llm_api_pytorch.py | 88 
++++++++++++++++++- .../test_lists/qa/llm_function_core.txt | 24 ++--- .../qa/llm_function_core_sanity.txt | 24 ++--- .../test_lists/qa/llm_function_nim.txt | 24 ++--- .../test_lists/test-db/l0_dgx_b200.yml | 17 ++-- .../test_lists/test-db/l0_dgx_h100.yml | 27 ++++-- .../test_lists/test-db/l0_rtx_pro_6000.yml | 4 + 10 files changed, 225 insertions(+), 71 deletions(-) diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu index 89eb935cf3..330364ee88 100644 --- a/cpp/kernels/xqa/mha.cu +++ b/cpp/kernels/xqa/mha.cu @@ -466,20 +466,53 @@ using WarpAcc = WarpAccT; #define MMAS_N_PER_MASK 2 __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskType const* mask, uint32_t rowOffset, - uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize) + uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + , + int32_t tok0WinBeg, uint32_t seqIter, uint32_t const cacheSeqLen, uint32_t const warpTileTokenBeg +#endif +) { uint32_t const idxInQuad = laneId() % 4; uint32_t const idxQuad = laneId() / 4; // Packed mask is aligned with 32 bits (2 uint16_t). 
uint32_t const nbPackedMasksPerRow = divUp(qSeqLen, 32u) * 2u; uint16_t const* uint16Mask = reinterpret_cast(mask); + constexpr uint64_t fullMask = ~uint64_t{0}; +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + Range const tileRange = {warpTileTokenBeg, warpTileTokenBeg + warpTile.x}; + Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (nbValidRows / MMAS_N_PER_MASK - 1)}; + bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end; + assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange)); + int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(warpTileTokenBeg); + uint32_t const nbSeqItersWithoutSpecDecMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x; + bool const ctaNeedSpecDecMask = (seqIter >= nbSeqItersWithoutSpecDecMask); +#else + constexpr bool ctaNeedBegMask = false; + bool const ctaNeedSpecDecMask = true; + int32_t const tok0NbMaskOut = -2147483648; +#endif + bool const needMask = ctaNeedBegMask || ctaNeedSpecDecMask; + + if (!needMask) + { + return; + } #pragma unroll for (uint32_t m = 0; m < acc.rows; m++) { #pragma unroll for (uint32_t i = 0; i < InstAcc::rows; i++) { - uint32_t const tokenRow = min((rowOffset + instM * m + idxQuad + i * 8) / headGrpSize, actualQSeqLen - 1); + uint32_t const idxQTokInCta = (rowOffset + instM * m + idxQuad + i * 8) / headGrpSize; + uint32_t const tokenRow = min(idxQTokInCta, actualQSeqLen - 1); +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta); + uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask); +#else + uint64_t const begMask = fullMask; +#endif + #pragma unroll for (uint32_t mask_n = 0; mask_n < acc.cols / MMAS_N_PER_MASK; mask_n++) { @@ -491,12 +524,15 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy uint32_t const maskPos1 = lastCol + actualQSeqLen < nbValidCols ? 
0u : min(lastCol + actualQSeqLen - nbValidCols, actualQSeqLen - 1); - uint32_t packedMask = 0u; uint32_t const maskPosStart = (maskPos0 / 16) * 16; - reinterpret_cast(&packedMask)[0] - = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)]; - reinterpret_cast(&packedMask)[1] - = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)]; + uint32_t packedMask = ~uint32_t{0}; + if (ctaNeedSpecDecMask) + { + reinterpret_cast(&packedMask)[0] + = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)]; + reinterpret_cast(&packedMask)[1] + = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)]; + } #pragma unroll for (uint32_t nj = 0; nj < MMAS_N_PER_MASK; nj++) { @@ -510,7 +546,11 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy bool const maskFlag = col + actualQSeqLen < nbValidCols ? true : packedMask & (1u << ((col + actualQSeqLen - nbValidCols) - maskPosStart)); - acc(m, n)(i, j) = maskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax; + + bool const begMaskFlag = ctaNeedBegMask ? (begMask & (1ULL << col)) : true; + + acc(m, n)(i, j) + = maskFlag && begMaskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax; } } } @@ -1611,8 +1651,14 @@ CUBIN_EXPORT __global__ #endif uint32_t const cacheSeqLen = getCacheSeqLen(cacheList, idxReq); -#if SLIDING_WINDOW +#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE + uint32_t const tok0SeqLen = cacheSeqLen - actualQSeqLen + 1 + idxHeadTokenInGrp; // ctaTokOffset; + int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize); + uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg); + +#elif SLIDING_WINDOW bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize); + assert(!SPEC_DEC || !rtIsReallySliding); uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0; #else constexpr bool rtIsReallySliding = false; @@ -1626,7 +1672,9 @@ CUBIN_EXPORT __global__ #endif uint32_t const nbSeqIters = useKVCache ? 
divUp(cacheSeqLen, ctaTile.x) : 0; -#if SPEC_DEC +#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE + uint32_t const nbSeqItersWithoutMask = nbSkipLeadingTiles; +#elif SPEC_DEC uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x; #endif @@ -1912,8 +1960,12 @@ CUBIN_EXPORT __global__ if (seqIter >= nbSeqItersWithoutMask) { uint32_t const nbValidCols = (warpTileTokenBeg < cacheSeqLen ? cacheSeqLen - warpTileTokenBeg : 0U); - applyMaskFromInput( - warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize); + applyMaskFromInput(warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize +#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE + , + tok0WinBeg, seqIter, cacheSeqLen, warpTileTokenBeg +#endif + ); } #else bool const isFirstIter = (seqIter == nbSkipLeadingTiles); diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 26c7716ba8..2e3af6fa36 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2895,6 +2895,7 @@ def launchTestJobs(pipeline, testFilter) x86SlurmTestConfigs = [ "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2], + "DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2], "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2], "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index d754eb701a..6e14627d85 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -475,7 +475,7 @@ class TrtllmAttentionWrapper: self.spec_decoding_generation_lengths, self.spec_decoding_position_offsets, self.spec_decoding_packed_mask ] - if get_sm_version() >= 100: + if 
self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()): spec_decoding_tensor_params.append( self.spec_decoding_bl_tree_mask_offset) spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask) @@ -1219,12 +1219,12 @@ class TrtllmAttentionMetadata(AttentionMetadata): # spec_dec mode should only be enabled for non-sm100 machines and when there's a spec-dec tree. self.is_spec_decoding_enabled = is_spec_decoding_enabled and ( - get_sm_version() < 100 or get_sm_version() == 120) + not self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version())) self.is_spec_dec_tree = spec_tree_manager is not None self.is_spec_dec_dynamic_tree = spec_tree_manager is not None and spec_tree_manager.use_dynamic_tree - if get_sm_version() >= 100 and get_sm_version() != 120: + if self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()): if self.is_spec_dec_tree or self.is_spec_dec_dynamic_tree: assert not self.is_spec_dec_tree, "Spec-dec tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec tree." 
@@ -1260,7 +1260,7 @@ class TrtllmAttentionMetadata(AttentionMetadata): device='cuda', ) - if get_sm_version() >= 100: + if self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()): self.spec_decoding_param_prepare_for_blackwell() else: self.spec_decoding_bl_tree_mask_offset = None @@ -1371,6 +1371,9 @@ class TrtllmAttentionMetadata(AttentionMetadata): self.spec_decoding_generation_lengths[:self.max_num_requests].copy_( spec_decoding_generation_length, non_blocking=True) + def is_sm_version_trtllm_gen_kernel(self, sm): + return not (sm < 100 or sm in [120, 121]) + class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]): diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 24bc65b5e1..35e60e0436 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4248,14 +4248,16 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ["CUTLASS", pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], ids=["cutlass", "trtllm", "triton"]) - def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker): + def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, + mocker): if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") - if get_sm_version() == 90 and moe_backend == "CUTLASS": + if get_sm_version() == 90: pytest.skip( - "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue") + "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4" + ) MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 @@ -4318,6 +4320,86 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): sampling_params=sampling_params, extra_evaluator_kwargs=extra_evaluator_kwargs) + @pytest.mark.skip_less_device(2) + @pytest.mark.timeout(14400) + @pytest.mark.parametrize("overlap_scheduler", [True, False], + ids=["overlap_scheduler", 
"no_overlap_scheduler"]) + @pytest.mark.parametrize("one_model", [True, False], + ids=["one_model", "two_model"]) + @pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], + ids=["cutlass", "trtllm", "triton"]) + def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, + mocker): + if moe_backend == "TRITON": + if not IS_TRITON_KERNELS_AVAILABLE: + pytest.skip("Triton kernels are not available") + + MAX_OUTPUT_LEN = 128179 + MAX_INPUT_LEN = 32768 + + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + mocker.patch.dict(GSM8K.EVALUATE_KWARGS, + {"scores_filter": "exact_match,flexible-extract"}) + + mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN) + mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN) + + # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue + pytorch_config = dict( + max_batch_size=8, + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig(max_batch_size=8)) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + dtype="auto") + + eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" + draft_len = 3 + spec_config = EagleDecodingConfig(max_draft_len=draft_len, + speculative_model_dir=eagle_model_dir, + eagle3_one_model=one_model) + + max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN + llm = LLM(self.MODEL_PATH, + tensor_parallel_size=2, + pipeline_parallel_size=1, + moe_expert_parallel_size=1, + kv_cache_config=kv_cache_config, + max_seq_len=max_seq_len, + speculative_config=spec_config, + **pytorch_config, + enable_attention_dp=False, + moe_config=MoeConfig(backend=moe_backend)) + + with llm: + model_name = "GPT-OSS/120B-MXFP4" + + # GSM8K + task = GSM8K(model_name) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) + + # GPQA Medium Reasoning + task = GPQADiamond(model_name) + + chat_template_kwargs = dict(reasoning_effort="medium") + 
extra_evaluator_kwargs = { + **self.extra_evaluator_kwargs, "chat_template_kwargs": + chat_template_kwargs + } + + sampling_params = SamplingParams( + temperature=1.0, + top_p=1.0, + max_tokens=MAX_OUTPUT_LEN, + truncate_prompt_tokens=MAX_INPUT_LEN) + + task.evaluate(llm, + sampling_params=sampling_params, + extra_evaluator_kwargs=extra_evaluator_kwargs) + @pytest.mark.skip_less_device(4) @pytest.mark.skip_device_not_contain(["GB200"]) @pytest.mark.parametrize( diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 92653058cd..7e58d0f500 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -566,18 +566,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler] 
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 6d32579f04..f468d262b1 100644 --- 
a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -103,18 +103,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] 
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 77f0563016..357cc80a05 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -342,18 +342,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler] 
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] 
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 7bac4b180f..04a4278ba6 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -57,8 +57,9 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 @@ -207,12 +208,12 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 7e50c6ebf8..c544501a9d 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -45,6 +45,25 @@ l0_dgx_h100: - 
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] # llmapi - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks +- condition: + ranges: + system_gpu_count: + gte: 2 + lte: 2 + wildcards: + gpu: + - '*h100*' + linux_distribution_name: ubuntu* + terms: + stage: pre_merge + backend: pytorch + auto_trigger: gpt_oss + orchestrator: mpi + tests: + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[triton-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[triton-two_model-overlap_scheduler] - condition: ranges: system_gpu_count: @@ -186,14 +205,6 @@ l0_dgx_h100: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler] - condition: ranges: system_gpu_count: diff --git 
a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml index 58200ca901..63deed9f86 100644 --- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml +++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml @@ -109,3 +109,7 @@ l0_rtx_pro_6000: # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler] From faabc1a3873d1a2ab6dc100644313298bd2250f8 Mon Sep 17 00:00:00 2001 From: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:57:32 -0800 Subject: [PATCH 016/172] [TRTLLM-7967][chore] Add more tests (#9415) Signed-off-by: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com> --- .../test_lists/qa/llm_perf_core.yml | 5 +- tests/integration/test_lists/waives.txt | 1 - .../modeling/test_modeling_starcoder2.py | 115 +++++++++++++++++- 3 files changed, 118 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index b059ee3f8c..b8f8b1f222 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -65,7 +65,10 @@ llm_perf_core: - 
perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128] - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-pytorch-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-float16-input_output_len:512,200] + - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-bfloat16-input_output_len:512,200] + - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-bfloat16-input_output_len:500,2000-con:50] + - perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:500,2000-con:50] + - perf/test_perf.py::test_perf[starcoder2_15b-bench-pytorch-bfloat16-input_output_len:500,2000-con:100] - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] # Ministral-8B diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 28714e45c0..f89284e126 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -172,7 +172,6 @@ perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] SKIP (https://nvbugspro.nvidia.com/bug/5295390) perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-float16-input_output_len:512,200] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,20] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] SKIP 
(https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) diff --git a/tests/unittest/_torch/modeling/test_modeling_starcoder2.py b/tests/unittest/_torch/modeling/test_modeling_starcoder2.py index 3eec8dc1e8..82dc9abf85 100644 --- a/tests/unittest/_torch/modeling/test_modeling_starcoder2.py +++ b/tests/unittest/_torch/modeling/test_modeling_starcoder2.py @@ -3,11 +3,15 @@ from dataclasses import dataclass import pytest import torch -from transformers import Starcoder2Config +from peft import LoraConfig as PeftLoraConfig +from peft import get_peft_model +from transformers import AutoModelForCausalLM, Starcoder2Config from transformers import Starcoder2ForCausalLM as HFStarcoder2ForCausalLM +from utils.llm_data import llm_models_root from utils.util import default_dtype import tensorrt_llm +from tensorrt_llm import LLM from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig @@ -15,7 +19,10 @@ from tensorrt_llm._torch.models.modeling_starcoder2 import Starcoder2ForCausalLM from tensorrt_llm._torch.modules.layer_norm import LayerNorm from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm.bindings.executor import KvCacheConfig +from tensorrt_llm.executor.request import LoRARequest +from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.mapping import Mapping +from tensorrt_llm.sampling_params import SamplingParams # Base config for all StarCoder2 models (based on HuggingFace configs) _STARCODER2_BASE_CONFIG = { @@ -311,3 +318,109 @@ def test_starcoder2_allclose_to_hf(scenario: Scenario) -> None: if graph_runner is not None: graph_runner.clear() kv_cache_manager.shutdown() + + +@torch.no_grad() +def test_starcoder2_multi_lora(tmp_path) -> None: + """ + Test StarCoder2 3b model 
with multiple synthetic LoRA adapters created using PEFT. + + This test creates dummy LoRA adapters for StarCoder2 and verifies that: + 1. Multiple LoRA adapters can be loaded and used simultaneously + 2. Different requests can use different LoRA adapters + 3. The model produces reasonable outputs with LoRA adapters applied + """ + + # Check if we have enough GPU memory (need ~10GB for StarCoder2-3B + LoRA) + _, total_mem = torch.cuda.mem_get_info() + min_mem_required = 10 * (2**30) # 10 GB + if total_mem < min_mem_required: + pytest.skip("Insufficient GPU memory for StarCoder2 with LoRA test") + + # Check for pretrained model + model_path = f"{llm_models_root()}/starcoder2-3b" + + # Target modules for LoRA - attention projections + target_modules = ["attn_q", "attn_k", "attn_v", "attn_dense"] + + # Load the pretrained model to create LoRA adapters + model = AutoModelForCausalLM.from_pretrained( + model_path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True + ) + + # HuggingFace module names for StarCoder2 attention + hf_modules = ["q_proj", "k_proj", "v_proj", "o_proj"] + + peft_lora_config = PeftLoraConfig( + r=8, # LoRA rank + lora_alpha=16, + target_modules=hf_modules, + lora_dropout=0.0, + bias="none", + task_type="CAUSAL_LM", + ) + + # Create two synthetic LoRA adapters with zeroed weights + lora_paths = [] + for i in range(2): + lora_model = get_peft_model(model, peft_lora_config) + + # Zero out all LoRA parameters for deterministic testing + for name, param in lora_model.named_parameters(): + if "lora_" in name: + param.data.zero_() + + # Save the LoRA adapter + lora_path = tmp_path / f"lora_{i}" + lora_model.save_pretrained(lora_path) + lora_paths.append(str(lora_path)) + + del model + del lora_model + torch.cuda.empty_cache() + + # Configure TensorRT-LLM LoRA + trtllm_lora_config = LoraConfig( + lora_target_modules=target_modules, max_lora_rank=8, max_loras=2, max_cpu_loras=2 + ) + + llm = LLM( + model_path, + 
lora_config=trtllm_lora_config, + # Disable CUDA graph for LoRA (LoRA is not supported with CUDA graphs yet) + cuda_graph_config=None, + ) + + with llm: + prompts = [ + "def fibonacci(n):", + "def quick_sort(arr):", + ] + + lora_req1 = LoRARequest("lora-1", 0, lora_paths[0]) + lora_req2 = LoRARequest("lora-2", 1, lora_paths[1]) + lora_requests = [lora_req1, lora_req2] + + # Sampling parameters + sampling_params = SamplingParams( + max_tokens=50, + temperature=0.0, # Greedy decoding for deterministic output + ) + + outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests) + + # Verify we got outputs for both prompts + assert len(outputs) == 2, f"Expected 2 outputs, got {len(outputs)}" + + # Verify each output has text + for i, output in enumerate(outputs): + assert len(output.outputs) > 0, f"Output {i} has no results" + assert len(output.outputs[0].text) > 0, f"Output {i} generated empty text" + + # Test without LoRA for comparison + outputs_no_lora = llm.generate(prompts, sampling_params, lora_request=None) + + assert len(outputs_no_lora) == 2 + + assert outputs[0].outputs[0].text == outputs_no_lora[0].outputs[0].text + assert outputs[1].outputs[0].text == outputs_no_lora[1].outputs[0].text From f9380581c5073cb2f96aa331d180ab408275d7ae Mon Sep 17 00:00:00 2001 From: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> Date: Mon, 8 Dec 2025 15:11:44 -0600 Subject: [PATCH 017/172] [https://nvbugs/5508267][fix] Proper handling of inactive canceled requests (#9280) Signed-off-by: thorjohnsen <41591019+thorjohnsen@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index f6cf6d4cb5..430a4dcd08 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2236,9 +2236,14 @@ class PyExecutor: 
# Remove cancel request in the waiting queue self.executor_request_queue.update_waiting_queue() + # Create set from list of canceled request ids to speed up canceled test + canceled_req_ids = set( + self.executor_request_queue.get_canceled_req_ids()) + + still_pending_canceled_ids = [] for request in self.active_requests: req_id = request.py_request_id if not request.is_child else request.parent_request_id - if req_id not in self.executor_request_queue.get_canceled_req_ids(): + if req_id not in canceled_req_ids: continue is_cancelled = self._try_cancel_request(request) @@ -2247,13 +2252,13 @@ class PyExecutor: # to clean up the KV cache resources. request.finish_by_reason(FinishReason.CANCELLED) request.decoding_iter = request.py_decoding_iter - self.executor_request_queue.canceled_req_ids.remove(req_id) + else: + still_pending_canceled_ids.append(req_id) - if self.enable_attention_dp: - # TODO: revisit the cancel logic of attention dp - # When enable attention dp, each rank does not have full copy of requests - # so we need to remove the cancel requests not in the local rank - self.executor_request_queue.clear_canceled_req_ids() + # Clear list of requests marked for cancellation and add back those that failed to cancel. 
+ self.executor_request_queue.canceled_req_ids.clear() + self.executor_request_queue.canceled_req_ids.extend( + still_pending_canceled_ids) @nvtx_range("_enqueue_responses") def _enqueue_responses(self, responses: Iterable[Tuple[int, LlmResponse]]): From 23cf72b0f824b2d13cb8b098ff4edb03a8a7a454 Mon Sep 17 00:00:00 2001 From: Eran Geva <19514940+MrGeva@users.noreply.github.com> Date: Mon, 8 Dec 2025 23:12:56 +0200 Subject: [PATCH 018/172] [#8921][feat] Added symetric memory AllReduce strategy (#8919) Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com> --- tensorrt_llm/_torch/distributed/ops.py | 70 ++++- .../_torch/distributed/symm_mem_allreduce.py | 240 ++++++++++++++++++ tensorrt_llm/functional.py | 1 + .../multigpu/test_ad_allreduce_strategies.py | 2 + 4 files changed, 306 insertions(+), 7 deletions(-) create mode 100644 tensorrt_llm/_torch/distributed/symm_mem_allreduce.py diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index ee104d07a9..fa8e61f322 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -7,6 +7,8 @@ from typing import List, Optional, Tuple, Union import torch from torch import nn +from tensorrt_llm._torch.distributed.symm_mem_allreduce import \ + SymmetricMemoryAllReduce from tensorrt_llm._utils import mpi_comm, mpi_disabled from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, @@ -567,13 +569,17 @@ class AllReduce(nn.Module): strategy (AllReduceStrategy): The following all-reduce strategies are supported: + - SYMM_MEM: Uses PyTorch's symmetric memory with MULTIMEM hardware instructions. + Falls back automatically if not supported. + - UB: AllReduce uses user-buffer based all-reduce kernel. - NCCL: Use NCCL allreduce. - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. - - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. 
+ - AUTO: AUTO chooses the best available strategy. Will try MNNVL, + then choose between NCCL and MIN_LATENCY based on a heuristic policy. - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. Should only be used on topologies with PCIe switches and without NVLink. @@ -602,12 +608,42 @@ class AllReduce(nn.Module): self.workspace = None self.strategy = strategy self.mnnvl_allreduce = None + self.symm_mem_allreduce = None self._disable_mpi = mpi_disabled() self.all_reduce_op = torch.ops.trtllm.allreduce_pg if self._disable_mpi else torch.ops.trtllm.allreduce if self.mapping.tp_size > 1: - # When Strategy is UB, it is guaranteed that the workspace is not used. + # Initialize Symmetric Memory AllReduce if needed (before workspace allocation) + if self.strategy == AllReduceStrategy.SYMM_MEM: + try: + symm_mem = SymmetricMemoryAllReduce( + self.mapping, + dtype=dtype if dtype else torch.bfloat16, + ) + if not symm_mem.disabled: + self.symm_mem_allreduce = symm_mem + logger.info( + f"SymmetricMemoryAllReduce (MULTIMEM) is enabled with fallback support for world_size={self.mapping.tp_size}" + ) + # Keep SYMM_MEM strategy but allocate workspace for fallback to regular allreduce + else: + logger.info( + f"SymmetricMemoryAllReduce is disabled (not supported or unavailable), falling back to AUTO strategy" + ) + # Fall back to AUTO if SYMM_MEM can't be enabled + self.strategy = AllReduceStrategy.AUTO + except Exception as e: + logger.info( + f"Symmetric Memory AllReduce can't be enabled due to {e}, falling back to AUTO strategy" + ) + self.symm_mem_allreduce = None + # Fall back to AUTO if SYMM_MEM initialization fails + self.strategy = AllReduceStrategy.AUTO + + # Allocate workspace for strategies that need it + # Note: SYMM_MEM now also needs workspace for fallback scenarios (fused ops, etc.) 
+ # Only UB doesn't need workspace if self.strategy != AllReduceStrategy.UB: if self.strategy == AllReduceStrategy.LOWPRECISION: allocate_low_presicion_allreduce_workspace(self.mapping) @@ -616,9 +652,10 @@ class AllReduce(nn.Module): AllReduceStrategy.NCCL_SYMMETRIC): self.workspace = get_allreduce_workspace(self.mapping) - # Initialize MNNVL AllReduce if needed + # Initialize MNNVL if using AUTO or MNNVL strategy if self.strategy in (AllReduceStrategy.AUTO, AllReduceStrategy.MNNVL): + # Try to initialize MNNVL if MNNVLAllReduce.is_mnnvl(self.mapping, dtype): # ALWAYS capture the exception when creating this instance try: @@ -674,20 +711,39 @@ class AllReduce(nn.Module): if all_reduce_params is None: all_reduce_params = AllReduceParams() - # Try MNNVL AllReduce first if available + # Try Symmetric Memory AllReduce first if available + # Note: Currently only supports NONE fusion op (plain allreduce) + if self.symm_mem_allreduce and all_reduce_params.fusion_op == AllReduceFusionOp.NONE: + symm_mem_output = self.symm_mem_allreduce(input) + if symm_mem_output is not None: + logger.debug( + f"Using SymmetricMemoryAllReduce (MULTIMEM) for input shape {input.shape}" + ) + return symm_mem_output + elif self.symm_mem_allreduce and all_reduce_params.fusion_op != AllReduceFusionOp.NONE: + # Log once per rank that we're skipping symm_mem due to fusion + logger.debug_once( + f"Skipping SymmetricMemoryAllReduce for fused operation (fusion_op={all_reduce_params.fusion_op}), using regular allreduce", + key=(self.mapping.tp_rank, all_reduce_params.fusion_op, + "debug_fusion_skip"), + ) + + # Try MNNVL AllReduce if symm_mem didn't handle it if self.mnnvl_allreduce: mnnvl_output = self.mnnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output - # Fall back to regular AllReduce if MNNVL is not available or not applicable - # Make sure the strategy is AUTO since allreduceOp does not have the branch for MNNVL - if allreduce_strategy 
== AllReduceStrategy.MNNVL: + # Fall back to regular AllReduce if specialized methods are not available or not applicable + # Make sure the strategy is AUTO since allreduceOp does not have the branch for MNNVL/SYMM_MEM + if allreduce_strategy in (AllReduceStrategy.MNNVL, + AllReduceStrategy.SYMM_MEM): allreduce_strategy = AllReduceStrategy.AUTO additional_args = {} if self._disable_mpi: + # Get ProcessGroup from mapping pg = self.mapping.tp_group_pg assert pg is not None, "TP ProcessGroup not initialised" additional_args = { diff --git a/tensorrt_llm/_torch/distributed/symm_mem_allreduce.py b/tensorrt_llm/_torch/distributed/symm_mem_allreduce.py new file mode 100644 index 0000000000..25e70001ed --- /dev/null +++ b/tensorrt_llm/_torch/distributed/symm_mem_allreduce.py @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +""" +Symmetric Memory AllReduce + +This module provides PyTorch Symmetric Memory-based allreduce operations, +leveraging MULTIMEM hardware instructions. +""" + +from typing import Optional + +import torch +import torch.distributed as dist +from torch import nn + +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping + +try: + import torch.distributed._symmetric_memory as torch_symm_mem + + SYMM_MEM_AVAILABLE = True +except ImportError: + SYMM_MEM_AVAILABLE = False + logger.warning( + "PyTorch symmetric memory not available. Install PyTorch >= 2.8 for MULTIMEM support." + ) + + +class SymmetricMemoryAllReduce(nn.Module): + """ + AllReduce implementation using PyTorch's symmetric memory operations. + This leverages MULTIMEM hardware instructions for faster allreduce operations. 
+ + Supported configurations (world_size): + - SM 9.0: 4, 6, 8 GPUs + - SM 10.0: 6, 8 GPUs + + """ + + # World sizes that support MULTIMEM instructions + _WORLD_SIZES_MULTIMEM = { + "9.0": [4, 6, 8], + "10.0": [6, 8], + } + + MiB = 1024 * 1024 + # Maximum buffer sizes for symmetric memory (bytes) + _MAX_SIZES = { + "9.0": { + 2: 64 * MiB, # 64 MB + 4: 32 * MiB, # 32 MB + 6: 64 * MiB, # 64 MB + 8: 64 * MiB, # 64 MB + }, + "10.0": { + 2: 8 * MiB, # 8 MB + 4: 32 * MiB, # 32 MB + 6: 128 * MiB, # 128 MB + 8: 128 * MiB, # 128 MB + }, + } + + def __init__( + self, + mapping: Mapping, + dtype: torch.dtype = torch.bfloat16, + group: Optional[dist.ProcessGroup] = None, + ): + super().__init__() + + self.disabled = True + self.mapping = mapping + self.dtype = dtype + self.world_size = mapping.tp_size + + if not SYMM_MEM_AVAILABLE: + logger.warning("SymmetricMemoryAllReduce: PyTorch symm_mem not available") + return + + if not torch.cuda.is_available(): + logger.warning("SymmetricMemoryAllReduce: CUDA not available") + return + + # Get device capability + device = torch.device(f"cuda:{mapping.tp_rank}") + capability = torch.cuda.get_device_capability(device) + self.device_capability = f"{capability[0]}.{capability[1]}" + + # Check if this configuration is supported + if self.device_capability not in self._MAX_SIZES: + logger.warning( + f"SymmetricMemoryAllReduce: Device capability {self.device_capability} not supported" + ) + return + + if self.world_size not in self._MAX_SIZES[self.device_capability]: + logger.info( + f"SymmetricMemoryAllReduce: World size {self.world_size} not supported " + f"for SM {self.device_capability}" + ) + return + + # Get max buffer size for this configuration + self.max_size = self._MAX_SIZES[self.device_capability][self.world_size] + + # Set up process group + self.group = group + if self.group is None: + # Get or create TP group with correct ranks + # For TP parallelism, we need ranks [0, 1, 2, ..., tp_size-1] globally + # NOT starting from 
tp_rank! + if not dist.is_initialized(): + logger.warning("SymmetricMemoryAllReduce: torch.distributed not initialized") + self.disabled = True + return + # Get actual TP group ranks from mapping (tp_group is a property, not a method) + tp_group_ranks = mapping.tp_group + self.group = dist.new_group(tp_group_ranks) if len(tp_group_ranks) > 1 else None + + # Enable symmetric memory for this group + try: + # Get group_name - this may fail if ProcessGroup doesn't have group_name set + if not hasattr(self.group, "group_name"): + logger.warning( + "SymmetricMemoryAllReduce: ProcessGroup does not have group_name attribute" + ) + self.disabled = True + return + + group_name_str = str(self.group.group_name) + torch_symm_mem.enable_symm_mem_for_group(group_name_str) + logger.debug( + f"SymmetricMemoryAllReduce: Enabled symmetric memory for group {group_name_str}" + ) + except Exception as e: + logger.warning( + f"SymmetricMemoryAllReduce: Failed to enable symmetric memory for group: {e}" + ) + self.disabled = True + return + + # Allocate symmetric memory buffer + try: + self.buffer = torch_symm_mem.empty( + self.max_size // self.dtype.itemsize, + device=device, + dtype=self.dtype, + ) + # Pass group name string + group_name_str = str(self.group.group_name) + handle = torch_symm_mem.rendezvous(self.buffer, group_name_str) + + if handle.multicast_ptr == 0: + logger.warning( + "SymmetricMemoryAllReduce: MULTIMEM operations not supported (multicast_ptr is 0)" + ) + return + + # Only enable if MULTIMEM is supported + # Otherwise, no benefit over existing TensorRT-LLM strategies + use_multimem = self.world_size in self._WORLD_SIZES_MULTIMEM.get( + self.device_capability, [] + ) + + if not use_multimem: + logger.info( + f"SymmetricMemoryAllReduce: MULTIMEM not supported for " + f"world_size={self.world_size}, SM={self.device_capability}. " + f"Falling back to standard allreduce strategies." 
+ ) + return + + self.disabled = False + logger.info( + f"SymmetricMemoryAllReduce (MULTIMEM) initialized: " + f"world_size={self.world_size}, " + f"max_size={self.max_size}, " + f"SM={self.device_capability}" + ) + + except Exception as e: + logger.warning(f"SymmetricMemoryAllReduce initialization failed: {e}") + return + + @property + def process_group(self) -> Optional[dist.ProcessGroup]: + """Expose the ProcessGroup for use in fallback scenarios.""" + return self.group if not self.disabled else None + + def can_use_symm_mem(self, inp: torch.Tensor) -> bool: + """Check if symmetric memory can be used for this tensor.""" + if self.disabled: + return False + if inp.dtype != self.dtype: + return False + inp_size = inp.numel() * inp.element_size() + if inp_size % 4 != 0: + return False + if inp_size >= self.max_size: + return False + return True + + def forward( + self, + inp: torch.Tensor, + out: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Perform allreduce using symmetric memory operations. 
+ + Args: + inp: Input tensor to reduce + out: Optional output tensor (if None, will be allocated) + + Returns: + Reduced tensor + """ + if not self.can_use_symm_mem(inp): + return None # Caller should fall back to other strategy + + if out is None: + out = torch.empty_like(inp) + + # Copy input to symmetric memory buffer + self.buffer[: inp.numel()].copy_(inp.view(-1)) + + # Perform MULTIMEM allreduce + # Pass group name string (matching vLLM's implementation) + group_name_str = str(self.group.group_name) + torch.ops.symm_mem.multimem_all_reduce_( + self.buffer[: inp.numel()], + "sum", + group_name_str, + ) + + # Copy result back + out.copy_(self.buffer[: inp.numel()].view(out.shape)) + + return out diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index b4c986fd6a..f341d75220 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -3881,6 +3881,7 @@ class AllReduceStrategy(IntEnum): LOWPRECISION = 6 MNNVL = 7 NCCL_SYMMETRIC = 8 + SYMM_MEM = 9 # PyTorch symmetric memory with MULTIMEM class AllReduceFusionOp(IntEnum): diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py index cab8b345b9..a8b4638ade 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py @@ -115,6 +115,7 @@ def _prepare_dataset(root_dir: str, temp_dir: str, model_path_or_name: str, num_ "TWOSHOT", "MIN_LATENCY", "NCCL", + "SYMM_MEM", ], ) def test_allreduce_strategies(llm_root, shared_dataset, allreduce_strategy): # noqa: F811 @@ -230,6 +231,7 @@ def test_allreduce_strategies(llm_root, shared_dataset, allreduce_strategy): # "NCCL", "TWOSHOT", "MIN_LATENCY", + "SYMM_MEM", ], ) def test_allreduce_strategy_propagation(strategy): From da074be037a201470c27831c73532fb9dda508bb Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen 
Date: Mon, 8 Dec 2025 13:31:37 -0800 Subject: [PATCH 019/172] [None][fix] Fix #8383 introduced TRTLLM backend python error (#9804) Signed-off-by: Jhao-Ting Chen --- tensorrt_llm/_torch/attention_backend/trtllm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index 6e14627d85..0623be8e65 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -604,6 +604,9 @@ class TrtllmAttentionWrapper: is_mla_enable, ) + def is_sm_version_trtllm_gen_kernel(self, sm): + return not (sm < 100 or sm in [120, 121]) + @dataclass(kw_only=True) class TrtllmAttentionMetadata(AttentionMetadata): From 75f5446d67bba1996b802b72624e2d7fadd68c9d Mon Sep 17 00:00:00 2001 From: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:24:27 -0800 Subject: [PATCH 020/172] [#9753][feat] AutoDeploy: Implement add rms_norm fusion (#9754) Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com> --- .../_torch/auto_deploy/config/default.yaml | 6 +- .../flashinfer_fused_add_rms_norm.py | 54 +++++++++++ .../transform/library/fused_add_rms_norm.py | 89 +++++++++++++++++++ .../examples/test_ad_speculative_decoding.py | 3 + .../test_flashinfer_fused_add_rms_norm_op.py | 47 ++++++++++ .../library/test_fused_add_rms_norm.py | 76 ++++++++++++++++ 6 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py create mode 100644 tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py create mode 100644 tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py create mode 100644 tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml 
b/tensorrt_llm/_torch/auto_deploy/config/default.yaml index 4edf3de150..93a8a540cf 100644 --- a/tensorrt_llm/_torch/auto_deploy/config/default.yaml +++ b/tensorrt_llm/_torch/auto_deploy/config/default.yaml @@ -128,10 +128,12 @@ transforms: # TODO (lucaslie): add backend selection as part of configurable inference optimizers fuse_rmsnorm: stage: post_load_fusion - rmsnorm_backend: triton + rmsnorm_backend: flashinfer gated_rmsnorm_backend: triton requires_shape_prop: true - + fuse_add_rms_norm: + stage: post_load_fusion + enabled: true ############################################################################################ # VISUALIZE GRAPH ############################################################################################ diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py new file mode 100644 index 0000000000..d7a183ce90 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import flashinfer +import torch + +from ...flashinfer_utils import get_env_enable_pdl + + +@torch.library.custom_op( + "auto_deploy::flashinfer_fused_add_rms_norm_inplace", mutates_args={"x", "residual"} +) +def flashinfer_fused_add_rms_norm_inplace( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float +) -> None: + """ + Fused Add + RMSNorm operation using FlashInfer (In-place). + Computes in-place: + residual = x + residual (sum) + x = rms_norm(residual, weight, eps) (normalized) + + Returns None. + """ + # FlashInfer expects 2D inputs (batch*seq_len, hidden_size) + x_shape = x.shape + residual_shape = residual.shape + x_flat = x.view(-1, x.shape[-1]) + residual_flat = residual.view(-1, residual.shape[-1]) + + flashinfer.norm.fused_add_rmsnorm( + x_flat, residual_flat, weight, eps, enable_pdl=get_env_enable_pdl() + ) + x_flat.view(x_shape) + residual_flat.view(residual_shape) + return + + +@flashinfer_fused_add_rms_norm_inplace.register_fake +def _(x, residual, weight, eps): + return + + +def flashinfer_fused_add_rms_norm(x, residual, weight, eps): + """Wrapper that calls the in-place op and returns the modified tensors.""" + torch.ops.auto_deploy.flashinfer_fused_add_rms_norm_inplace(x, residual, weight, eps) + return x, residual diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py new file mode 100644 index 0000000000..d0bfeee09b --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformation for fusing Add + Cast + RMSNorm.""" + +from typing import Tuple + +import torch +from torch.fx import GraphModule + +from ...custom_ops.flashinfer_fused_add_rms_norm import flashinfer_fused_add_rms_norm +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface +from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern +from ..interface import BaseTransform, SharedConfig, TransformInfo, TransformRegistry + + +@TransformRegistry.register("fuse_add_rms_norm") +class FuseAddRMSNorm(BaseTransform): + """Fuse (add + cast + RMSNorm) into one fused op. 
+ + Matches: + x = add(input, residual) + y = x.to(dtype) + z = flashinfer_rms_norm(y, weight, eps) + + Replaces with: + z, x = flashinfer_fused_add_rms_norm(input, residual, weight, eps) + """ + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + patterns = ADPatternMatcherPass() + + # Dummy shapes for tracing + bsz, hidden = 2, 128 + dummy_args = [ + torch.randn(bsz, hidden, device="meta", dtype=torch.bfloat16), # x (bf16) + torch.randn(bsz, hidden, device="meta", dtype=torch.bfloat16), # residual (bf16) + torch.randn(hidden, device="meta", dtype=torch.bfloat16), # weight + 1e-5, # eps + ] + + op_ignore_types = {torch.ops.aten.to.dtype: (torch.dtype,)} + scalar_workaround = {"eps": 1e-5} + + def _fused_add_norm_pattern(x, residual, weight, eps): + added = torch.ops.aten.add.Tensor(x, residual) + cast = torch.ops.aten.to.dtype(added, torch.bfloat16) + # Note: we assume flashinfer_rms_norm is the target + norm = torch.ops.auto_deploy.flashinfer_rms_norm.default(cast, weight, eps) + return norm, added + + def _fused_add_norm_replacement(x, residual, weight, eps): + # Use the python wrapper directly, not via torch.ops.auto_deploy + return flashinfer_fused_add_rms_norm(x, residual, weight, eps) + + # Register pattern + register_ad_pattern( + search_fn=_fused_add_norm_pattern, + replace_fn=_fused_add_norm_replacement, + patterns=patterns, + dummy_args=dummy_args, + op_ignore_types=op_ignore_types, + scalar_workaround=scalar_workaround, + ) + + num_matches = patterns.apply(gm.graph) + + info = TransformInfo( + skipped=False, + num_matches=num_matches, + is_clean=num_matches == 0, + has_valid_shapes=num_matches == 0, + ) + return gm, info diff --git a/tests/integration/defs/examples/test_ad_speculative_decoding.py b/tests/integration/defs/examples/test_ad_speculative_decoding.py index 34552f967b..1c328863ac 100644 --- 
a/tests/integration/defs/examples/test_ad_speculative_decoding.py +++ b/tests/integration/defs/examples/test_ad_speculative_decoding.py @@ -81,6 +81,9 @@ def run_with_autodeploy(model, speculative_model_dir, batch_size): "world_size": 1, "kv_cache_config": kv_cache_config, "disable_overlap_scheduler": True, + "transforms": { + "fuse_rmsnorm": {"rmsnorm_backend": "triton"}, + }, "max_num_tokens": 64, } diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py new file mode 100644 index 0000000000..f6d67afb04 --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py @@ -0,0 +1,47 @@ +import pytest +import torch + +from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import ( + flashinfer_fused_add_rms_norm, +) + + +def rms_norm_ref(x, weight, eps): + """Reference implementation of RMSNorm using PyTorch ops.""" + input_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + eps) + return weight * x.to(input_dtype) + + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("hidden_size", [128, 1024]) +def test_flashinfer_fused_add_rms_norm_kernel(dtype, hidden_size): + bsz = 4 + seq_len = 128 + eps = 1e-6 + + # Create inputs + x = torch.randn(bsz, seq_len, hidden_size, device="cuda", dtype=dtype) + residual = torch.randn_like(x) + weight = torch.randn(hidden_size, device="cuda", dtype=dtype) + + # Clone for reference + x_ref = x.clone() + residual_ref = residual.clone() + + residual_ref_out = x_ref + residual_ref + x_ref_out = rms_norm_ref(residual_ref_out, weight, eps) + + # Run kernel (Our fused op) + x_out, residual_out = flashinfer_fused_add_rms_norm(x, residual, weight, eps) + + rtol, atol = (1e-2, 1e-2) + + 
torch.testing.assert_close(residual_out, residual_ref_out, rtol=rtol, atol=atol) + torch.testing.assert_close(x_out, x_ref_out, rtol=rtol, atol=atol) + + # Verify in-place modification happened + assert x is x_out + assert residual is residual_out diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py new file mode 100644 index 0000000000..8cfb59756a --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py @@ -0,0 +1,76 @@ +import torch +from torch.export import Dim + +from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer +from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op + + +class TestModel(torch.nn.Module): + def __init__(self, hidden_size=128, eps=1e-5): + super().__init__() + self.weight = torch.nn.Parameter( + torch.ones(hidden_size, device="cuda", dtype=torch.bfloat16) + ) + self.eps = eps + + def forward(self, x, residual): + added = x + residual + cast = added.to(torch.bfloat16) + norm = torch.ops.auto_deploy.flashinfer_rms_norm(cast, self.weight, self.eps) + return norm, added + + +def _run_test(model): + # The replacement uses flashinfer_fused_add_rms_norm python wrapper which calls the inplace op + # auto_deploy::flashinfer_fused_add_rms_norm_inplace + op = torch.ops.auto_deploy.flashinfer_fused_add_rms_norm_inplace + + def checker(gm): + return any(is_op(n, op) for n in gm.graph.nodes) + + bsz, seq_len, hidden = 2, 8, 128 + # Inputs should be bfloat16 + x = torch.randn(bsz, seq_len, hidden, device="cuda", dtype=torch.bfloat16) + residual = 
torch.randn(bsz, seq_len, hidden, device="cuda", dtype=torch.bfloat16) + + # Dynamic shapes + ds_x = {0: Dim("batch_size", max=8)} + ds_res = {0: Dim("batch_size", max=8)} + + gm = torch_export_to_gm(model, args=(x, residual), dynamic_shapes=(ds_x, ds_res), clone=True) + + gm_transformed = InferenceOptimizer( + None, + { + "fuse_add_rms_norm": { + "stage": "post_load_fusion", + }, + }, + )(None, gm) + + # Check if transform happened + if not checker(gm_transformed): + raise AssertionError( + "flashinfer_fused_add_rms_norm_inplace op not found in transformed graph" + ) + + # Validation + # Clone inputs because the fused op is inplace + x_in = x.clone() + res_in = residual.clone() + + # The fused op is inplace, so inputs x_in and res_in will be modified. + # gm_transformed returns (x_in, res_in) which are the modified tensors. + y_transformed = gm_transformed(x_in, res_in) + + y_model = model(x.clone(), residual.clone()) + torch.testing.assert_close(y_transformed[0], y_model[0], atol=1e-2, rtol=1e-2) + torch.testing.assert_close(y_transformed[1], y_model[1], atol=1e-2, rtol=1e-2) + + +def test_fuse_add_rms_norm(): + model = TestModel() + _run_test(model) From 390391ebf1f11f6eb490c7a7e3ff6f74db64b95a Mon Sep 17 00:00:00 2001 From: yuanjingx87 <197832395+yuanjingx87@users.noreply.github.com> Date: Mon, 8 Dec 2025 17:48:21 -0800 Subject: [PATCH 021/172] [None][infra] Correct the waived test names due to a merge conflict (#9803) Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f89284e126..0deb0676d3 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -440,4 +440,5 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen 
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5722653) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) From 1c4dacb19a52b1b68079bea1960c8160d149cca3 Mon Sep 17 00:00:00 2001 From: Tri Dao Date: Mon, 8 Dec 2025 21:16:29 -0500 Subject: [PATCH 022/172] [None][fix] Fix PDL in TRTLLM MOE for dsv3 (#9799) Signed-off-by: Tri Dao --- .../kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu | 6 +++--- .../kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu | 4 ++-- cpp/tensorrt_llm/kernels/noAuxTcKernels.cu | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu index 2139682dd9..1480be8140 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu @@ -296,7 +296,7 @@ public: __device__ void issue_mainloop() { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #pragma unroll 1 for (int loop_idx = 0; loop_idx < k_iter_cnt; loop_idx++) { @@ -601,8 +601,8 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel( } } __syncthreads(); - asm volatile("griddepcontrol.wait;"); - asm 
volatile("griddepcontrol.launch_dependents;"); + cudaGridDependencySynchronize(); + cudaTriggerProgrammaticLaunchCompletion(); if (warp_idx < 2) { diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu index 08659b6c83..34557cc490 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu @@ -74,7 +74,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(float* out, T const } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Process the GEMM in chunks @@ -167,7 +167,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(float* out, T const } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu index f21c8c6235..b132a54b5f 100644 --- a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu @@ -47,7 +47,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx int64_t const numExperts, int64_t const numExpertsPerGroup, double const routedScalingFactor) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // declare shared memory structure @@ -254,7 +254,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } From d6f961d3fe1de4853ce99c91e2c69d205e35698c Mon Sep 17 00:00:00 2001 From: bhsueh_NV 
<11360707+byshiue@users.noreply.github.com> Date: Tue, 9 Dec 2025 10:27:39 +0800 Subject: [PATCH 023/172] [None][feat] Add llama4 scaling (#9771) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- tensorrt_llm/_torch/modules/attention.py | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index ed23eb7aab..383ebf8296 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -985,6 +985,14 @@ class MLA(nn.Module): is_neox=pos_embd_params.is_neox, ) + self.llama_4_scaling = False + if hasattr(config.pretrained_config, 'llama_4_scaling'): + self.llama_4_scaling = True + self.floor_scale = getattr(config.pretrained_config.llama_4_scaling, + 'original_max_position_embeddings', 8192) + self.attn_scale = getattr(config.pretrained_config.llama_4_scaling, + 'beta', 0.1) + if not config.skip_create_weights_in_init: self.create_weights() @@ -1127,6 +1135,18 @@ class MLA(nn.Module): return hidden_states.new_empty([num_tokens, hidden_size], dtype=hidden_states.dtype) + def _attention_scaling(self, q, position_ids): + + def _get_attn_scale(position_ids: torch.Tensor) -> torch.Tensor: + positions = position_ids.view(-1) + floor = torch.floor((positions + 1.0) / self.floor_scale) + attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0 + return attn_scale.unsqueeze(-1) + + attn_scale = _get_attn_scale(position_ids) + q = (q * attn_scale).to(q.dtype) + return q + def forward_impl(self, position_ids: Optional[torch.Tensor], hidden_states: torch.Tensor, @@ -1197,6 +1217,10 @@ class MLA(nn.Module): assert position_ids is not None k_pe_ctx = self.apply_rope(q_ctx, k_pe_ctx, position_ids) + if self.llama_4_scaling: + q_ctx = self._attention_scaling( + q_ctx, position_ids[..., :num_ctx_tokens]) + self.forward_context( q_ctx, compressed_kv_ctx, @@ -1217,6 +1241,10 @@ class MLA(nn.Module): assert position_ids is 
not None k_pe_gen = self.apply_rope(q_gen, k_pe_gen, position_ids) + if self.llama_4_scaling: + q_gen = self._attention_scaling( + q_gen, position_ids[..., num_ctx_tokens:]) + self.forward_absorption_generation( q_gen, compressed_kv_gen, From 4a3a66b1249890005734a9b052c576a667002986 Mon Sep 17 00:00:00 2001 From: Jiagan Cheng Date: Tue, 9 Dec 2025 10:43:52 +0800 Subject: [PATCH 024/172] [https://nvbugs/5677746][fix] Use first PP rank's schedule result in other PP ranks to fix PP hang (#9659) Signed-off-by: Jiagan Cheng --- .../batch_manager/capacityScheduler.cpp | 6 +- tensorrt_llm/_torch/pyexecutor/py_executor.py | 135 +++++++++++++++--- tensorrt_llm/_torch/pyexecutor/scheduler.py | 70 +++++++++ .../integration/test_lists/test-db/l0_a10.yml | 1 + .../test_scheduler_serializable_output.py | 59 ++++++++ 5 files changed, 248 insertions(+), 23 deletions(-) create mode 100644 tests/unittest/_torch/executor/test_scheduler_serializable_output.py diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp index 9c9c56ba9d..d765bcf317 100644 --- a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp +++ b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp @@ -247,7 +247,8 @@ std::tuple GuaranteedNoEvictScheduler::impl( { break; } - else if (req->isGenerationInProgressState()) + + if (req->isGenerationInProgressState()) { scheduledRequests.emplace_back(req); reservedBlocks.decrementReservedBlocks(*req); @@ -296,7 +297,8 @@ std::tuple GuaranteedNoEvictScheduler::impl( { break; } - else if (req->isContextInitState() || req->isDisaggGenerationInitState()) + + if (req->isContextInitState() || req->isDisaggGenerationInitState()) { bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req); bool enoughCrossBlocks diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 430a4dcd08..3751dff618 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ 
b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -53,7 +53,8 @@ from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, from .model_engine import ModelEngine from .resource_manager import ResourceManager from .sampler import Sampler, SampleState, SampleStateTensors -from .scheduler import RequestScheduler, ScheduledRequests +from .scheduler import (RequestScheduler, ScheduledRequests, + SerializableSchedulerOutput) # Environment variable to specify iteration ranges for profiling start/stop. # Format: "start1-stop1,start2-stop2,..." or single iterations "iter1,iter2,..." @@ -65,6 +66,8 @@ PROFILE_TRACE_ENV_VAR_NAME = "TLLM_TORCH_PROFILE_TRACE" # Unique tag base to avoid collisions with token/logits comms TERMINATION_COMM_TAG_BASE = 20000 +PP_COMM_TAG_SCHEDULE_RESULT = 21000 +PP_COMM_TAG_SAMPLE_STATE_BASE = 21001 @functools.cache @@ -232,6 +235,10 @@ class PyExecutor: self.micro_batches: List[BatchStatePP | None] = [None] * self.num_micro_batches self.send_handles = [None] * self.num_micro_batches + # schedule handle for PP to propagate the first PP rank's schedule result + self.send_schedule_handler = None + self.pp_scheduler_max_retry_count = int( + os.environ.get("TLLM_PP_SCHEDULER_MAX_RETRY_COUNT", 10)) # Set of request IDs that are currently in flight across all micro batches. # The scheduler will avoid scheduling requests that are already in flight. @@ -786,6 +793,77 @@ class PyExecutor: self.response_cv.notify_all() self.shutdown_event.set() + def _pp_schedule_and_propagate(self): + """The first PP rank schedules the requests and propagates the result to all other PP ranks.""" + + # The first PP rank schedules the requests, other ranks receive the schedule result from the previous PP rank. 
+ if self.dist.is_first_pp_rank: + scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule( + ) + serializable_schedule = SerializableSchedulerOutput.from_scheduler_result( + scheduled_batch, fitting_disagg_gen_init_requests, + num_fitting_reqs) + else: + with nvtx_range("recv_schedule_from_prev_pp"): + serializable_schedule = self.dist.recv_object( + self.dist.prev_pp_rank, PP_COMM_TAG_SCHEDULE_RESULT) + scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = serializable_schedule.to_scheduler_result( + self.active_requests) + + # Propagate the schedule result to the next PP rank except the last PP rank. + if not self.dist.is_last_pp_rank: + if self.send_schedule_handler is not None: + with nvtx_range("wait_send_schedule_handler"): + self.send_schedule_handler.wait() + with nvtx_range("send_schedule_to_next_pp"): + self.send_schedule_handler = self.dist.isend_object( + serializable_schedule, self.dist.next_pp_rank, + PP_COMM_TAG_SCHEDULE_RESULT) + return scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs + + def _pp_retry_until_can_schedule(self, scheduled_batch): + """ + If current rank cannot run the scheduled batch, it will retry following steps until it has enough KV cache resources or reach maximum retry count: + 1. Wait for cache transceiver to finish at least one cache transmission. + 2. Terminate requests that have finished context cache transmission. + 3. Check if current rank has enough KV cache resources to run the scheduled batch. + """ + scheduled_batch_requests = scheduled_batch.all_requests() + if self.scheduler.can_schedule(scheduled_batch_requests): + return + + logger.warning( + "Cannot run first PP's schedule result due to limited KV cache resources. This may cause bubbles in the PP pipeline. Please consider increasing the KV cache size by setting `free_gpu_memory_fraction` to a larger value." 
+ ) + if self.kv_cache_transceiver is None: + raise RuntimeError( + "KV cache transceiver is not enabled, but current rank cannot run first PP's schedule result due to limited KV cache resources. This is not expected." + ) + if not self.ctx_in_transmission_requests: + raise RuntimeError( + "No context cache transmission is in progress, but current rank cannot run first PP's schedule result due to limited KV cache resources. This is not expected." + ) + if self.block_reuse_enabled and self._disagg_pp_termination_handler is not None: + raise RuntimeError( + "Cannot terminate requests in cache transmission and release their KV cache resources when block reuse is enabled. Please consider increasing the KV cache size." + ) + + for retry_count in range(self.pp_scheduler_max_retry_count): + if self.scheduler.can_schedule(scheduled_batch_requests): + break + logger.debug( + f"Retrying to run first PP's schedule result ({retry_count + 1}/{self.pp_scheduler_max_retry_count})" + ) + + # Let cache transceiver finish at least one cache transmission and release requests' KV cache resources + self._check_disagg_ctx_cache_transfer_status(1) + self._check_kv_transfer_timeout() + self._terminate_disagg_ctx_finished_requests() + else: + raise RuntimeError( + f"Reach maximum PP retry count ({self.pp_scheduler_max_retry_count}) but still cannot run first PP's schedule result. Please consider increasing the KV cache size by setting `free_gpu_memory_fraction` to a larger value. Or you can set `TLLM_PP_SCHEDULER_MAX_RETRY_COUNT` to a larger value to allow more retries." 
+ ) + def _executor_loop_pp(self): logger.debug(f"Starting executor loop for pp_rank {self.dist.pp_rank}") torch.cuda.set_device(self.device_id) @@ -799,6 +877,8 @@ class PyExecutor: profile_step() if self.enable_iter_perf_stats: iter_start_time = time.time() + + # Fetch new requests from request queue new_requests = self._fetch_and_activate_new_requests() if self.should_stop_processing: break @@ -816,11 +896,18 @@ class PyExecutor: self._pad_attention_dp_dummy_request() - scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule( + # Stage 0: first PP rank schedules requests and propagates the result to all other PP ranks. + scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._pp_schedule_and_propagate( ) + if not self.dist.is_first_pp_rank: + # Retry until current rank can run first PP's schedule result. + self._pp_retry_until_can_schedule(scheduled_batch) + # Run scheduler locally because scheduler may change llm requests' state. + self.scheduler.schedule_request(self.active_requests, + self.inflight_req_ids) + # For requests that are fitting disagg gen init, also prepare resources for KV cache manager if self.kv_cache_transceiver: - # For requests that are fitting disagg gen init, also prepare resources for KV cache manager self._prepare_disagg_gen_init( fitting_disagg_gen_init_requests) @@ -840,7 +927,6 @@ class PyExecutor: ) can_queue = self._can_queue(scheduled_batch) - if not can_queue: logger.debug( f"microbatch {microbatch_id} cannot be queued, skipping" @@ -928,6 +1014,7 @@ class PyExecutor: prev_microbatch_id = (microbatch_id + offset) % self.num_micro_batches previous_batch = self.micro_batches[prev_microbatch_id] + tag = PP_COMM_TAG_SAMPLE_STATE_BASE + prev_microbatch_id if previous_batch is not None: sample_state = previous_batch.sample_state if not self.dist.is_last_pp_rank: @@ -937,7 +1024,7 @@ class PyExecutor: with nvtx_range("recv_sample_state"): sample_state.host = recv_object_funct( 
src=self.dist.prev_pp_rank, - tag=prev_microbatch_id, + tag=tag, ) # Send tokens to next pp rank (w.r.t model forward direction) @@ -949,7 +1036,7 @@ class PyExecutor: prev_microbatch_id] = self.dist.isend_object( sample_state.host, dest=self.dist.next_pp_rank, - tag=prev_microbatch_id) + tag=tag) # Stage 3: Finalize previous batch that finished sample state communication # In last pp rank, stage 2 and 3 process different previous batches @@ -1746,24 +1833,26 @@ class PyExecutor: def _waiting_requests(self, context_requests: list[LlmRequest], generation_requests: list[LlmRequest]): - if not self.enable_batch_waiting: - return context_requests + """ + Return an empty list if scheduled requests fulfill the waiting conditions, otherwise return the original context requests. + Waiting conditions: + - The number of scheduled tokens (both context and generation) is smaller than `self.batch_wait_max_tokens_ratio * self.max_num_tokens` + - The number of waiting iterations is smaller than `self.batch_wait_timeout_iters`. 
+ """ - waited_context_requests = [] - stop_waiting = False num_scheduled_ctx_tokens = sum( len(ctx_req.get_tokens(0)) for ctx_req in context_requests) num_scheduled_gen_tokens = sum(1 + gen_req.num_draft_tokens for gen_req in generation_requests) num_scheduled_tokens = num_scheduled_ctx_tokens + num_scheduled_gen_tokens - stop_waiting = self.batch_wait_iters_count >= self.batch_wait_timeout_iters or num_scheduled_tokens >= self.batch_wait_max_tokens_ratio * self.max_num_tokens - if stop_waiting: - waited_context_requests = context_requests - self.batch_wait_iters_count = 0 - else: + should_waiting = self.batch_wait_iters_count < self.batch_wait_timeout_iters and num_scheduled_tokens < self.batch_wait_max_tokens_ratio * self.max_num_tokens + if should_waiting: self.batch_wait_iters_count += 1 - return waited_context_requests + return [] + + self.batch_wait_iters_count = 0 + return context_requests @nvtx_range("_schedule") def _schedule(self): @@ -1775,10 +1864,11 @@ class PyExecutor: scheduler_output.context_requests, scheduler_output.generation_requests) - # if no generation requests, no need to wait, to avoid dead waiting - if not self.enable_attention_dp and self.enable_batch_waiting and len( - scheduler_output.context_requests) > 0 and len( - scheduler_output.generation_requests) > 0: + # If no generation requests, no need to wait, to avoid dead waiting + should_check_waiting = not self.enable_attention_dp and self.enable_batch_waiting and len( + scheduler_output.context_requests) > 0 and len( + scheduler_output.generation_requests) > 0 + if should_check_waiting: scheduled_context_requests = self._waiting_requests( scheduler_output.context_requests, scheduler_output.generation_requests) @@ -2408,7 +2498,10 @@ class PyExecutor: @nvtx_range("_terminate_disagg_ctx_finished_requests") def _terminate_disagg_ctx_finished_requests(self): - for request_id in list(self.ctx_in_transmission_requests.keys()): + # make a copy of the keys, since we are modifying the 
dictionary in the loop + in_transmission_requests_id = list( + self.ctx_in_transmission_requests.keys()) + for request_id in in_transmission_requests_id: request, block_id, counter = self.ctx_in_transmission_requests[ request_id] diff --git a/tensorrt_llm/_torch/pyexecutor/scheduler.py b/tensorrt_llm/_torch/pyexecutor/scheduler.py index c71c4596ed..2c1d8f916f 100644 --- a/tensorrt_llm/_torch/pyexecutor/scheduler.py +++ b/tensorrt_llm/_torch/pyexecutor/scheduler.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from collections import namedtuple +from dataclasses import dataclass from typing import Optional, Tuple from strenum import StrEnum @@ -54,6 +55,70 @@ class RequestScheduler(ABC): # to be aligned with RequestScheduler::scheduleRequests in cpp/tensorrt_llm/batch_manager/requestScheduler.h raise NotImplementedError + @abstractmethod + def can_schedule(self, requests: RequestList) -> bool: + """ + Check if current rank can schedule the requests. + :param requests: list of requests to be scheduled + :return: True if current rank can schedule the requests, False otherwise + """ + raise NotImplementedError + + +@dataclass +class SerializableSchedulerOutput: + """ + Serializable version of SchedulerOutput, used for sending schedule result to other ranks. Need this class because LlmRequest is not serializable by pickle. 
+ """ + context_requests: list[int] # request ids of context requests + generation_requests: list[int] # request ids of generation requests + paused_requests: list[int] # request ids of paused requests + fitting_disagg_gen_init_requests: list[ + int] # request ids of fitting disaggregated generation initialization requests + num_fitting_requests: int # number of fitting requests + + @classmethod + def from_scheduler_result( + cls, scheduled_requests: ScheduledRequests, + fitting_disagg_gen_init_requests: RequestList, + num_fitting_requests: int) -> "SerializableSchedulerOutput": + return cls(context_requests=[ + req.request_id for req in scheduled_requests.context_requests + ], + generation_requests=[ + req.request_id + for req in scheduled_requests.generation_requests + ], + paused_requests=[ + req.request_id + for req in scheduled_requests.paused_requests + ], + fitting_disagg_gen_init_requests=[ + req.request_id + for req in fitting_disagg_gen_init_requests + ], + num_fitting_requests=num_fitting_requests) + + def to_scheduler_result( + self, active_requests: RequestList + ) -> Tuple[ScheduledRequests, RequestList, int]: + id_to_request = {req.request_id: req for req in active_requests} + scheduled_requests = ScheduledRequests() + scheduled_requests.context_requests = [ + id_to_request[req_id] for req_id in self.context_requests + ] + scheduled_requests.generation_requests = [ + id_to_request[req_id] for req_id in self.generation_requests + ] + scheduled_requests.paused_requests = [ + id_to_request[req_id] for req_id in self.paused_requests + ] + fitting_disagg_gen_init_requests = [ + id_to_request[req_id] + for req_id in self.fitting_disagg_gen_init_requests + ] + return scheduled_requests, fitting_disagg_gen_init_requests, self.num_fitting_requests + class CapacityScheduler(ABC): @@ -216,3 +281,8 @@ class SimpleScheduler(RequestScheduler): list(generation_requests), list(paused_requests), list(fitting_disagg_gen_init_requests), len(fitting_requests)) + + def 
can_schedule(self, requests: RequestList) -> bool: + fitting_requests, _, _ = self.capacity_scheduler.schedule_request( + requests) + return len(fitting_requests) == len(requests) diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 7eb00943f6..36a5bc32e5 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -20,6 +20,7 @@ l0_a10: - unittest/_torch/modeling/test_modeling_mistral.py - unittest/_torch/modeling/test_modeling_pixtral.py - unittest/_torch/sampler/test_trtllm_sampler.py + - unittest/_torch/executor/test_scheduler_serializable_output.py # NOTE: this is a CPU-only test, but we do not have a dedicated job for this (and therefore no # test list either). - unittest/_torch/models/checkpoints/hf/test_weight_loader.py diff --git a/tests/unittest/_torch/executor/test_scheduler_serializable_output.py b/tests/unittest/_torch/executor/test_scheduler_serializable_output.py new file mode 100644 index 0000000000..94fba12d7d --- /dev/null +++ b/tests/unittest/_torch/executor/test_scheduler_serializable_output.py @@ -0,0 +1,59 @@ +import pickle + +from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest, SamplingConfig +from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests, SerializableSchedulerOutput + + +def _make_request(request_id: int) -> LlmRequest: + return LlmRequest( + request_id=request_id, + max_new_tokens=5, + input_tokens=[request_id], + sampling_config=SamplingConfig(), + is_streaming=False, + ) + + +def _request_ids(requests): + return [req.request_id for req in requests] + + +def test_serializable_scheduler_output_round_trip(): + # Create all requests and put them in a pool + request_pool = {idx: _make_request(idx) for idx in range(1, 8)} + + # Create scheduler result: scheduled_requests, fitting_disagg_gen_init_requests, num_fitting_requests + scheduled_requests = ScheduledRequests() + 
scheduled_requests.context_requests = [request_pool[1], request_pool[2]] + scheduled_requests.generation_requests = [request_pool[3]] + scheduled_requests.paused_requests = [request_pool[4]] + fitting_disagg_gen_init_requests = [request_pool[5], request_pool[6]] + num_fitting_requests = 3 + + # Create serializable scheduler output from scheduler result + serializable_output = SerializableSchedulerOutput.from_scheduler_result( + scheduled_requests, fitting_disagg_gen_init_requests, num_fitting_requests + ) + + # Serialize and deserialize the serializable scheduler output + serialized_bytes = pickle.dumps(serializable_output) + restored_output: SerializableSchedulerOutput = pickle.loads(serialized_bytes) + + # Restore the scheduler result from the deserialized serializable scheduler output + active_requests = list(request_pool.values()) + restored_schedule, restored_fitting, restored_num_fitting = restored_output.to_scheduler_result( + active_requests + ) + + # Verify the restored scheduler result is correct + assert restored_num_fitting == num_fitting_requests + assert _request_ids(restored_schedule.context_requests) == _request_ids( + scheduled_requests.context_requests + ) + assert _request_ids(restored_schedule.generation_requests) == _request_ids( + scheduled_requests.generation_requests + ) + assert _request_ids(restored_schedule.paused_requests) == _request_ids( + scheduled_requests.paused_requests + ) + assert _request_ids(restored_fitting) == _request_ids(fitting_disagg_gen_init_requests) From f521f6d9100469ee73f4ee27e92bce7ceb9567f0 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Tue, 9 Dec 2025 11:15:40 +0800 Subject: [PATCH 025/172] [None][fix] Fix unterminated process issue for RemoteOpenAIServer (#9490) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tests/unittest/llmapi/apps/openai_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/tests/unittest/llmapi/apps/openai_server.py b/tests/unittest/llmapi/apps/openai_server.py index b3fde6b94c..15dd94eb47 100644 --- a/tests/unittest/llmapi/apps/openai_server.py +++ b/tests/unittest/llmapi/apps/openai_server.py @@ -97,6 +97,8 @@ class RemoteOpenAIServer: time.sleep(0.5) if time.time() - start > timeout: + # Terminate the server to avoid the process keeping running in background after timeout + self.terminate() raise RuntimeError( "Server failed to start in time.") from err From c7a2568872dcb2cf21739fcdd1273d5b832b241d Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Tue, 9 Dec 2025 03:19:45 +0000 Subject: [PATCH 026/172] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- security_scanning/docs/poetry.lock | 6 +- .../examples/auto_deploy/poetry.lock | 6 +- .../examples/draft_target_model/poetry.lock | 6 +- security_scanning/examples/eagle/poetry.lock | 6 +- .../llm-eval/lm-eval-harness/poetry.lock | 6 +- .../examples/lookahead/poetry.lock | 6 +- security_scanning/examples/medusa/poetry.lock | 6 +- .../models/contrib/baichuan/poetry.lock | 6 +- .../examples/models/contrib/bloom/poetry.lock | 6 +- .../models/contrib/chatglm-6b/poetry.lock | 6 +- .../models/contrib/chatglm2-6b/poetry.lock | 6 +- .../contrib/chatglm3-6b-32k/poetry.lock | 6 +- .../examples/models/contrib/dbrx/poetry.lock | 6 +- .../models/contrib/deepseek_v1/poetry.lock | 6 +- .../models/contrib/deepseek_v2/poetry.lock | 6 +- .../models/contrib/falcon/poetry.lock | 6 +- .../examples/models/contrib/gptj/poetry.lock | 6 +- .../models/contrib/gptneox/poetry.lock | 6 +- .../examples/models/contrib/grok/poetry.lock | 6 +- .../models/contrib/internlm/poetry.lock | 6 +- .../examples/models/contrib/jais/poetry.lock | 6 +- .../examples/models/contrib/mmdit/poetry.lock | 112 ++++++++++++++++-- .../examples/models/contrib/mpt/poetry.lock | 6 +- 
.../examples/models/contrib/opt/poetry.lock | 6 +- .../models/contrib/skywork/poetry.lock | 6 +- .../examples/models/contrib/smaug/poetry.lock | 6 +- .../examples/models/contrib/stdit/poetry.lock | 6 +- .../examples/models/core/commandr/poetry.lock | 6 +- .../examples/models/core/gemma/poetry.lock | 6 +- .../examples/models/core/glm-4-9b/poetry.lock | 6 +- .../examples/models/core/gpt/poetry.lock | 6 +- .../examples/models/core/llama/poetry.lock | 6 +- .../examples/models/core/mamba/poetry.lock | 6 +- .../examples/models/core/mixtral/poetry.lock | 6 +- .../examples/models/core/mllama/poetry.lock | 6 +- .../examples/models/core/nemotron/poetry.lock | 6 +- .../examples/models/core/phi/poetry.lock | 6 +- .../examples/models/core/qwen/poetry.lock | 6 +- .../models/core/qwen2audio/poetry.lock | 6 +- .../examples/models/core/qwenvl/poetry.lock | 6 +- .../models/core/recurrentgemma/poetry.lock | 6 +- .../examples/models/core/whisper/poetry.lock | 6 +- security_scanning/examples/ngram/poetry.lock | 6 +- .../examples/quantization/poetry.lock | 6 +- .../examples/ray_orchestrator/poetry.lock | 6 +- .../examples/redrafter/poetry.lock | 6 +- .../examples/trtllm-eval/poetry.lock | 6 +- security_scanning/metadata.json | 4 +- security_scanning/poetry.lock | 34 +++--- security_scanning/pyproject.toml | 2 +- security_scanning/triton_backend/poetry.lock | 6 +- 51 files changed, 265 insertions(+), 169 deletions(-) diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index 6166cc74e7..8e633a2abd 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -1195,13 +1195,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/auto_deploy/poetry.lock b/security_scanning/examples/auto_deploy/poetry.lock index 34b7c63f1b..f41cf9682a 100644 --- a/security_scanning/examples/auto_deploy/poetry.lock +++ b/security_scanning/examples/auto_deploy/poetry.lock @@ -3624,13 +3624,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/draft_target_model/poetry.lock b/security_scanning/examples/draft_target_model/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/draft_target_model/poetry.lock +++ b/security_scanning/examples/draft_target_model/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/eagle/poetry.lock b/security_scanning/examples/eagle/poetry.lock index 9968a7af4d..d885d2a20e 100644 --- a/security_scanning/examples/eagle/poetry.lock +++ b/security_scanning/examples/eagle/poetry.lock @@ -1807,13 +1807,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock index 21af9af3d7..3463f2a104 100644 --- a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock +++ b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock @@ -3262,13 +3262,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library 
with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/lookahead/poetry.lock b/security_scanning/examples/lookahead/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/lookahead/poetry.lock +++ b/security_scanning/examples/lookahead/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/medusa/poetry.lock b/security_scanning/examples/medusa/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/medusa/poetry.lock +++ b/security_scanning/examples/medusa/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/baichuan/poetry.lock b/security_scanning/examples/models/contrib/baichuan/poetry.lock index 7372454415..4f6a876470 100644 --- a/security_scanning/examples/models/contrib/baichuan/poetry.lock +++ b/security_scanning/examples/models/contrib/baichuan/poetry.lock @@ -1998,13 +1998,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library 
with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/bloom/poetry.lock b/security_scanning/examples/models/contrib/bloom/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/contrib/bloom/poetry.lock +++ b/security_scanning/examples/models/contrib/bloom/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock index 80e0ffbd0b..e982d71c4b 100644 --- a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock index 80e0ffbd0b..e982d71c4b 100644 --- a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock index 80e0ffbd0b..e982d71c4b 100644 --- a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/dbrx/poetry.lock b/security_scanning/examples/models/contrib/dbrx/poetry.lock index fba379e942..34f97eabd5 100644 --- a/security_scanning/examples/models/contrib/dbrx/poetry.lock +++ b/security_scanning/examples/models/contrib/dbrx/poetry.lock @@ -1805,13 +1805,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock index c21ae3e779..6305bc2199 100644 --- a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/falcon/poetry.lock b/security_scanning/examples/models/contrib/falcon/poetry.lock index 7c2d170ec2..6263aae157 100644 --- a/security_scanning/examples/models/contrib/falcon/poetry.lock +++ b/security_scanning/examples/models/contrib/falcon/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/gptj/poetry.lock b/security_scanning/examples/models/contrib/gptj/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/contrib/gptj/poetry.lock +++ b/security_scanning/examples/models/contrib/gptj/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/gptneox/poetry.lock b/security_scanning/examples/models/contrib/gptneox/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/contrib/gptneox/poetry.lock +++ b/security_scanning/examples/models/contrib/gptneox/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/grok/poetry.lock b/security_scanning/examples/models/contrib/grok/poetry.lock index f6761e0abb..2f119a2247 100644 --- a/security_scanning/examples/models/contrib/grok/poetry.lock +++ b/security_scanning/examples/models/contrib/grok/poetry.lock @@ -2718,13 +2718,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/internlm/poetry.lock b/security_scanning/examples/models/contrib/internlm/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/contrib/internlm/poetry.lock +++ b/security_scanning/examples/models/contrib/internlm/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/jais/poetry.lock b/security_scanning/examples/models/contrib/jais/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/contrib/jais/poetry.lock +++ b/security_scanning/examples/models/contrib/jais/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/mmdit/poetry.lock b/security_scanning/examples/models/contrib/mmdit/poetry.lock index 6a6c623fb4..70f96cbc56 100644 --- a/security_scanning/examples/models/contrib/mmdit/poetry.lock +++ b/security_scanning/examples/models/contrib/mmdit/poetry.lock @@ -1,5 +1,24 @@ # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +[[package]] +name = "anyio" +version = "4.12.0" +description = "High-level concurrency and networking framework on top of asyncio or Trio" +optional = false +python-versions = ">=3.9" +files = [ + {file = "anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb"}, + {file = "anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} + +[package.extras] +trio = ["trio (>=0.31.0)", "trio (>=0.32.0)"] + [[package]] name = "certifi" version = "2025.11.12" @@ -146,18 +165,19 @@ files = [ [[package]] name = "diffusers" -version = "0.35.2" +version = "0.36.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false python-versions = ">=3.8.0" files = [ - {file = "diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5"}, - {file = "diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded"}, + {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, + {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.34.0" +httpx = "<1.0.0" +huggingface-hub = ">=0.34.0,<2.0" importlib_metadata = "*" numpy = "*" Pillow = "*" @@ -167,16 +187,34 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", 
"pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] +nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] -training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard"] +training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] + +[[package]] +name = "exceptiongroup" +version = "1.3.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, + {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} + +[package.extras] +test = ["pytest (>=6)"] [[package]] name = 
"filelock" @@ -228,6 +266,17 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] tqdm = ["tqdm"] +[[package]] +name = "h11" +version = "0.16.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.8" +files = [ + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, +] + [[package]] name = "hf-xet" version = "1.2.0" @@ -262,6 +311,51 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "httpcore" +version = "1.0.9" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55"}, + {file = "httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.16" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<1.0)"] + +[[package]] +name = "httpx" +version = "0.28.1" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, + {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0)"] + [[package]] name = "huggingface-hub" version = "0.36.0" @@ -933,13 +1027,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/mpt/poetry.lock b/security_scanning/examples/models/contrib/mpt/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/contrib/mpt/poetry.lock +++ b/security_scanning/examples/models/contrib/mpt/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/opt/poetry.lock b/security_scanning/examples/models/contrib/opt/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/contrib/opt/poetry.lock +++ b/security_scanning/examples/models/contrib/opt/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/skywork/poetry.lock b/security_scanning/examples/models/contrib/skywork/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/contrib/skywork/poetry.lock +++ b/security_scanning/examples/models/contrib/skywork/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/smaug/poetry.lock b/security_scanning/examples/models/contrib/smaug/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/contrib/smaug/poetry.lock +++ b/security_scanning/examples/models/contrib/smaug/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/stdit/poetry.lock b/security_scanning/examples/models/contrib/stdit/poetry.lock index 1e325bf1a6..48d254621a 100644 --- a/security_scanning/examples/models/contrib/stdit/poetry.lock +++ b/security_scanning/examples/models/contrib/stdit/poetry.lock @@ -2194,13 +2194,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/commandr/poetry.lock b/security_scanning/examples/models/core/commandr/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/core/commandr/poetry.lock +++ b/security_scanning/examples/models/core/commandr/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/gemma/poetry.lock b/security_scanning/examples/models/core/gemma/poetry.lock index 145366904e..afbadc04e8 100644 --- a/security_scanning/examples/models/core/gemma/poetry.lock +++ b/security_scanning/examples/models/core/gemma/poetry.lock @@ -2746,13 +2746,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/glm-4-9b/poetry.lock b/security_scanning/examples/models/core/glm-4-9b/poetry.lock index 80e0ffbd0b..e982d71c4b 100644 --- a/security_scanning/examples/models/core/glm-4-9b/poetry.lock +++ b/security_scanning/examples/models/core/glm-4-9b/poetry.lock @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/gpt/poetry.lock b/security_scanning/examples/models/core/gpt/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/models/core/gpt/poetry.lock +++ b/security_scanning/examples/models/core/gpt/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/llama/poetry.lock b/security_scanning/examples/models/core/llama/poetry.lock index b520d1846b..fe7a7444fe 100644 --- a/security_scanning/examples/models/core/llama/poetry.lock +++ b/security_scanning/examples/models/core/llama/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mamba/poetry.lock b/security_scanning/examples/models/core/mamba/poetry.lock index 7c0a67f0a7..d064087a64 100644 --- a/security_scanning/examples/models/core/mamba/poetry.lock +++ b/security_scanning/examples/models/core/mamba/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mixtral/poetry.lock b/security_scanning/examples/models/core/mixtral/poetry.lock index 616b62c621..35127a881e 100644 --- a/security_scanning/examples/models/core/mixtral/poetry.lock +++ b/security_scanning/examples/models/core/mixtral/poetry.lock @@ -1304,13 +1304,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index 11e0ed3ccb..3bd2e8468d 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -1800,13 +1800,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/nemotron/poetry.lock b/security_scanning/examples/models/core/nemotron/poetry.lock index d503bbc216..ad690d14e4 100644 --- a/security_scanning/examples/models/core/nemotron/poetry.lock +++ b/security_scanning/examples/models/core/nemotron/poetry.lock @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/phi/poetry.lock b/security_scanning/examples/models/core/phi/poetry.lock index 6b864141d3..7fc49d12c8 100644 --- a/security_scanning/examples/models/core/phi/poetry.lock +++ b/security_scanning/examples/models/core/phi/poetry.lock @@ -1816,13 +1816,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index d90ba7a90b..8ece45a078 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -3442,13 +3442,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwen2audio/poetry.lock b/security_scanning/examples/models/core/qwen2audio/poetry.lock index 4702cbfaee..91bc119d78 100644 --- a/security_scanning/examples/models/core/qwen2audio/poetry.lock +++ b/security_scanning/examples/models/core/qwen2audio/poetry.lock @@ -1962,13 +1962,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwenvl/poetry.lock b/security_scanning/examples/models/core/qwenvl/poetry.lock index 668fbcfeea..f4a26d49cc 100644 --- a/security_scanning/examples/models/core/qwenvl/poetry.lock +++ b/security_scanning/examples/models/core/qwenvl/poetry.lock @@ -3065,13 +3065,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/recurrentgemma/poetry.lock b/security_scanning/examples/models/core/recurrentgemma/poetry.lock index 99f114a265..286080432a 100644 --- a/security_scanning/examples/models/core/recurrentgemma/poetry.lock +++ b/security_scanning/examples/models/core/recurrentgemma/poetry.lock @@ -2506,13 +2506,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index f9d9b95965..f13ed38ca8 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -2857,13 +2857,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/ngram/poetry.lock b/security_scanning/examples/ngram/poetry.lock index 3bd3c2724c..e68bc97032 100644 --- a/security_scanning/examples/ngram/poetry.lock +++ b/security_scanning/examples/ngram/poetry.lock @@ -1821,13 +1821,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/quantization/poetry.lock b/security_scanning/examples/quantization/poetry.lock index 8d393c78f9..7490b5d493 100644 --- a/security_scanning/examples/quantization/poetry.lock +++ b/security_scanning/examples/quantization/poetry.lock @@ -1965,13 +1965,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, 
and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/ray_orchestrator/poetry.lock b/security_scanning/examples/ray_orchestrator/poetry.lock index b5d6f97bc9..9a3fd66660 100644 --- a/security_scanning/examples/ray_orchestrator/poetry.lock +++ b/security_scanning/examples/ray_orchestrator/poetry.lock @@ -1902,13 +1902,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/redrafter/poetry.lock b/security_scanning/examples/redrafter/poetry.lock index 46ade916e3..03e1ded960 100644 --- a/security_scanning/examples/redrafter/poetry.lock +++ b/security_scanning/examples/redrafter/poetry.lock @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/examples/trtllm-eval/poetry.lock b/security_scanning/examples/trtllm-eval/poetry.lock index 7d353fd2ed..150bbc6c69 100644 --- a/security_scanning/examples/trtllm-eval/poetry.lock +++ b/security_scanning/examples/trtllm-eval/poetry.lock @@ -3264,13 +3264,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 0c24542544..9eacde56c2 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "8e27ce7084d9fab1051e88fc945732e59689761b", - "timestamp": "2025-12-08T02:39:23Z" + "commit_hash": "d6f961d3fe1de4853ce99c91e2c69d205e35698c", + "timestamp": "2025-12-09T02:39:28Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 18ed93657e..18da005341 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -885,18 +885,19 @@ vision = ["Pillow (>=9.4.0)"] [[package]] name = "diffusers" -version = "0.35.2" +version = "0.36.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false python-versions = ">=3.8.0" files = [ - {file = "diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5"}, - {file = "diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded"}, + {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, + {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.34.0" +httpx = "<1.0.0" +huggingface-hub = ">=0.34.0,<2.0" importlib_metadata = "*" numpy = "*" Pillow = "*" @@ -906,16 +907,17 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", 
"pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] +nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] -training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard"] +training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] [[package]] name = "dill" @@ -2028,13 +2030,13 @@ files = [ [[package]] name = "meson" -version = "1.9.2" +version = "1.10.0" description = "A high performance build system" optional = false python-versions = ">=3.7" files = [ - {file = "meson-1.9.2-py3-none-any.whl", hash = "sha256:1a284dc1912929098a6462401af58dc49ae3f324e94814a38a8f1020cee07cba"}, - {file = "meson-1.9.2.tar.gz", hash = "sha256:3499b59bb23982496e01e57b4103ac2f826f9c3a3f59e507a0a832487fe55e3d"}, + {file = "meson-1.10.0-py3-none-any.whl", hash = "sha256:4b27aafce281e652dcb437b28007457411245d975c48b5db3a797d3e93ae1585"}, + {file = 
"meson-1.10.0.tar.gz", hash = "sha256:8071860c1f46a75ea34801490fd1c445c9d75147a65508cd3a10366a7006cc1c"}, ] [package.extras] @@ -2781,13 +2783,13 @@ typing-extensions = "*" [[package]] name = "nvidia-ml-py" -version = "13.580.82" +version = "13.590.44" description = "Python Bindings for the NVIDIA Management Library" optional = false python-versions = "*" files = [ - {file = "nvidia_ml_py-13.580.82-py3-none-any.whl", hash = "sha256:4361db337b0c551e2d101936dae2e9a60f957af26818e8c0c3a1f32b8db8d0a7"}, - {file = "nvidia_ml_py-13.580.82.tar.gz", hash = "sha256:0c028805dc53a0e2a6985ea801888197765ac2ef8f1c9e29a7bf0d3616a5efc7"}, + {file = "nvidia_ml_py-13.590.44-py3-none-any.whl", hash = "sha256:18feb54eca7d0e3cdc8d1a040a771eda72d9ec3148e5443087970dbfd7377ecc"}, + {file = "nvidia_ml_py-13.590.44.tar.gz", hash = "sha256:b358c7614b0fdeea4b95f046f1c90123bfe25d148ab93bb1c00248b834703373"}, ] [[package]] @@ -5362,13 +5364,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] @@ -5771,4 +5773,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "d139908ab9a0e12fb01f83db076f4e2154879d6f215b808f864103a4a57aea75" +content-hash = "3ec3d9eabf7664da1722a32823997c383d7eab2dc54fa2c10e67849245300beb" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index b5478ebabe..562500afc0 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -62,7 +62,7 @@ llguidance = "0.7.29" jsonschema = "^4.25.1" backoff = "^2.2.1" nvtx = "^0.2.14" -meson = "^1.9.2" +meson = "^1.10.0" ninja = "^1.13.0" etcd3 = {git = "https://github.com/kragniz/python-etcd3.git", rev = "e58a899579ba416449c4e225b61f039457c8072a"} blake3 = "^1.0.8" diff --git a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index d924c884a7..2bc400198c 100644 --- a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -959,13 +959,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.0" +version = "2.6.1" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.0-py3-none-any.whl", hash = "sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f"}, - {file = "urllib3-2.6.0.tar.gz", hash = "sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1"}, + {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, + {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, ] [package.extras] From f2006a1f743ea694bcc8455e6ee4b5ec83ac3c97 Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Tue, 9 Dec 2025 11:51:43 +0800 Subject: [PATCH 027/172] [https://nvbugs/5726066][infra] Waive timeout disaggregated/test_auto_scaling tests. (#9815) Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 0deb0676d3..238b198aa5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -433,6 +433,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughp accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5726066) 
unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) From bafb60c1bc958dd8e45df22b1990e61a2c218d8c Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 20:08:52 -0800 Subject: [PATCH 028/172] [None][chore] Fix tests failing on pre-merge 12/08 (#9819) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 238b198aa5..89585ec95e 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -444,3 +444,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5722653) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) From 90890785ebd4b226e62e30226e98395790d6b226 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Tue, 9 Dec 2025 12:34:55 +0800 Subject: [PATCH 
029/172] [https://nvbugs/5722653][fix] Fix config file used by disagg_client (#9783) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> Signed-off-by: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> --- tests/integration/defs/disaggregated/test_disaggregated.py | 5 ++--- tests/integration/test_lists/waives.txt | 4 ---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index bb811de4d1..1c43dd50e2 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -1594,9 +1594,8 @@ def run_disaggregated_benchmark(example_dir, # Ensure the sever has started client_dir = f"{example_dir}/clients" client_cmd = [ - 'python3', f'{client_dir}/disagg_client.py', '-c', - f'{example_dir}/disagg_config.yaml', '-p', - f'{client_dir}/prompts.json', '--ignore-eos', + 'python3', f'{client_dir}/disagg_client.py', '-c', config_file, + '-p', f'{client_dir}/prompts.json', '--ignore-eos', '--server-start-timeout', str(server_start_timeout) ] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 89585ec95e..e143bd843e 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -438,10 +438,6 @@ disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644) 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5722653) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5722653) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5722653) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) From b050804b63a4312bc58991aa963f416ce049ad3b Mon Sep 17 00:00:00 2001 From: Shi Xiaowei <39303645+Shixiaowei02@users.noreply.github.com> Date: Tue, 9 Dec 2025 12:54:53 +0800 Subject: [PATCH 030/172] [TRTLLM-6537][infra] extend multi-gpu tests related file list (#9614) Signed-off-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com> --- jenkins/L0_MergeRequest.groovy | 1 + .../integration/defs/accuracy/test_disaggregated_serving.py | 5 ++--- tests/integration/test_lists/waives.txt | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index a08d5b4b23..e3c80bf48c 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -740,6 +740,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars) "tests/unittest/disaggregated/", 
"tests/unittest/llmapi/test_llm_multi_gpu.py", "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py", + "tests/integration/defs/accuracy/test_disaggregated_serving.py", ] def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 3d7ed84dfd..894114c0f4 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -45,9 +45,8 @@ class Result(GenerationResultBase): DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async']) -# TODO: Change back to 1800 when the disaggregated serving test slowdown issue is resolved. -DEFAULT_TEST_TIMEOUT = 3600 -DEFAULT_SERVER_WAITING_TIMEOUT = 3600 +DEFAULT_TEST_TIMEOUT = 1200 +DEFAULT_SERVER_WAITING_TIMEOUT = 1200 class MyThreadPoolExecutor(ThreadPoolExecutor): diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index e143bd843e..ce265f4ac2 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -432,6 +432,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) +disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) 
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5726066) From 309f92ec0915eb35ddad489c246f9d4ec06e6af8 Mon Sep 17 00:00:00 2001 From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Date: Tue, 9 Dec 2025 13:49:41 +0800 Subject: [PATCH 031/172] [None][infra] Use artifactory pypi mirror for Cython install (#9774) Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> --- docker/common/install_mpi4py.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/common/install_mpi4py.sh b/docker/common/install_mpi4py.sh index dd0c3d71a8..e7cad8e1f6 100644 --- a/docker/common/install_mpi4py.sh +++ b/docker/common/install_mpi4py.sh @@ -5,6 +5,7 @@ set -ex GITHUB_URL="https://github.com" if [ -n "${GITHUB_MIRROR}" ]; then GITHUB_URL=${GITHUB_MIRROR} + export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" fi MPI4PY_VERSION="3.1.5" From 252769c9304e92a2d9c98fcb5eb06d2749e51955 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Dec 2025 13:51:30 +0800 Subject: [PATCH 032/172] [TRTLLM-9794][ci] remove duplicated test cases in DGX B200 (#9817) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 - tests/integration/test_lists/test-db/l0_dgx_b200.yml | 4 ---- tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml | 2 +- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 35e60e0436..590f4d92c2 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -596,7 +596,6 @@ class TestLlama3_2_3B(LlmapiAccuracyTestHarness): 
@pytest.mark.timeout(7200) -@pytest.mark.skip_less_host_memory(1000000) @pytest.mark.skip_less_device_memory(80000) # 1TB is basic requirement for large model tests. CG4 120G only has 800G host memory, and 480G is shared with GPUs. the test will cause the system crash. class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness): diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 04a4278ba6..21dbac3289 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -21,8 +21,6 @@ l0_dgx_b200: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] @@ -61,8 +59,6 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] - 
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 - - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4] - condition: diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index 5c5bc4132b..d6ad030f10 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -20,6 +20,7 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] @@ -58,7 +59,6 @@ l0_gb200_multi_gpus: stage: post_merge backend: pytorch tests: - - 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] From fbcf03040f9ab98670a680b384eb13608ca8f7ae Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:00:43 +0800 Subject: [PATCH 033/172] [None][test] Refactor qa/llm_perf_nim.yml test list (#9700) Signed-off-by: yufeiwu <230315618+yufeiwu-nv@users.noreply.github.com> --- .../test_lists/qa/llm_perf_cluster_nim.yml | 141 ---- .../test_lists/qa/llm_perf_nim.yml | 681 +++++++++--------- 2 files changed, 341 insertions(+), 481 deletions(-) delete mode 100644 tests/integration/test_lists/qa/llm_perf_cluster_nim.yml diff --git a/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml b/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml deleted file mode 100644 index b938600890..0000000000 --- a/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml +++ /dev/null @@ -1,141 +0,0 @@ -version: 0.0.1 -llm_perf_cluster_nim: -- condition: - ranges: - system_gpu_count: - gte: 1 - tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8] - - 
perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8] - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-maxbs:2048-maxnt:8192-input_output_len:256,256-reqs:200] - # for chunked prefill cases - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200] - # Phi-4-multimodal-instruct - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - #Mistral-Small-3.1-24B-Instruct-2503 - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200] - - 
perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] TIMEOUT(120) - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200] TIMEOUT(120) - - -- condition: - ranges: - system_gpu_count: - gte: 2 - tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:256-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2] - #Mistral-Small-3.1-24B-Instruct-2503 - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1-gpus:2] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200-gpus:2] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1-gpus:2] TIMEOUT(120) - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200-gpus:2] TIMEOUT(120) - -# Tests for systems with 4+ GPUs -- condition: - ranges: - system_gpu_count: - gte: 4 - tests: - - 
perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4] - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) - # for chunked prefill cases - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4] - #llama_v3.1_405b_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1024,2048-gpus:4] - - 
perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-gpus:4] TIMEOUT(120) - #llama_v3.3_70b_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) - #llama_v4_scout_17b_16e_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:500-gpus:4] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:500-gpus:4] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) - - -# Tests for systems with 8+ GPUs -- condition: - ranges: - system_gpu_count: - gte: 8 - tests: - #llama_v3.3_nemotron_super_49b - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8] - #llama_v3.3_70b_instruct_fp4 - - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] - - #llama_v4_scout_17b_16e_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] - - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] - #deepseek_r1_fp8 - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test - #deepseek_r1_nvfp4 - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test - - 
perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] TIMEOUT (120) #max throughput test - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test - # for chunked prefill cases - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) - #deepseek_r1_0528_fp4 - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) - - 
perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) - - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] - #gpt_oss_120b - # max throughput test - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180) - # min latency test - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] diff --git a/tests/integration/test_lists/qa/llm_perf_nim.yml b/tests/integration/test_lists/qa/llm_perf_nim.yml index 1888fff6db..0b81b2c506 100644 --- a/tests/integration/test_lists/qa/llm_perf_nim.yml +++ b/tests/integration/test_lists/qa/llm_perf_nim.yml @@ -1,395 +1,396 @@ 
version: 0.0.1 llm_perf_nim: -# one gpu test +# =============================================================================== +# Test Conditions Index +# =============================================================================== +# 1: All GPUs common tests +# 2: A100, L20, L40S, H100, H20, H200 +# 3: A100, L40S, H100, H20, H200 +# 4: A100, H100, H20, H200 test cases +# 5: L40S, H100, H200, H20, B200, B300 test cases +# 6: L40S, H100, H200, H20, GB200, GB300 test cases +# 7: H100, H200, H20 common test cases +# 8: L20, L40S, H100, H200, H20 common test cases +# 9: H20, H200 test cases +# 10: L20, L40S, H100, H200, H20, B200, GB200, B300, GB300 common test cases +# 11: B200, GB200, B300, GB300, RTX6000-Server common test cases +# 12: B200, B300, RTX6000-Server test cases +# 13: B200, GB200, B300, GB300 test cases +# 14: B200, B300 test cases +# =============================================================================== + + +# 1: All GPUs common tests - condition: ranges: system_gpu_count: gte: 1 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*l40s*' - - '*l20*' - - '*h20*' tests: - # E2E trtllm-bench - #llama_v3.1_8b_instruct - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] - # Mistral-7B - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:128,128] - # Phi-4-mini-instruct - # cpp - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-con:250] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-con:250] - # reduced 'reqs' to fit timeout 
limit - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-reqs:8-con:1] - - -# FP8 specific tests -- condition: - terms: - supports_fp8: true - ranges: - system_gpu_count: - gte: 1 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*l40s*' - - '*l20*' - - '*h20*' - tests: - # Phi-4-mini-instruct - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-con:250] - # reduced 'reqs' to fit timeout limit - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] +# 2: A100, L20, L40S, H100, H20, H200 - condition: ranges: system_gpu_count: gte: 1 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*h20*' + compute_capability: + lt: 10.0 tests: - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-reqs:10-con:1] - # Llama-3.1-Nemotron-Nano-8B-v1 - # cpp backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-con:250] - - 
perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:250] - # pyt backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250] - # FP8 prequantized pyt backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] - #long 
time llama_nemotron cases - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] # timeout for l20, l40s, a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1] #timeout for l20, l40s, failed for a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-reqs:8-con:1] # timeout for l20, l40s, failed on a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250] # failed for a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:250] # failed on A100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-con:250] # failed on A100 15 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-con:250] # timeout for l20, l40s, a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-con:250] # timeout for l20, l40s, failed on A100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] # failed for l20, need to extend context token to 5000 for l40s and a100, timeout for h20 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] #need to extend context token to 20000 for l40s, timeout for h20, a100 - -# FP8 specific tests -- condition: - 
terms: - supports_fp8: true - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*l40s*' - - '*l20*' - - '*h20*' - - '*b200*' - - '*gb200*' - tests: - #llama_v3.1_8b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a16_awq] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a8_awq] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] - #mistral_7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:1000,1000-quant:fp8] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:500,2000-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:1-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:250-gpus:2] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:128,128] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-loras:8-gpus:2] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-reqs:8-con:1] + - 
perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-con:250] + - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] -- condition: - terms: - supports_fp8: true - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*h20*' - tests: - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:1] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:250] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:10-con:250] - - -# 2 gpus test -- condition: - ranges: - system_gpu_count: - gte: 2 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*l40s*' - - '*l20*' - - '*h20*' - tests: - #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-loras:8-gpus:2] - #llama_v3.2_1b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:1-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:250-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2] - #t5 - - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] - -- condition: - ranges: - system_gpu_count: - gte: 2 - gpu_memory: - gt: 80000 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*h20*' - tests: - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] - #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2] - -# FP8 specific tests -- condition: - terms: - supports_fp8: true 
- ranges: - system_gpu_count: - gte: 2 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*l40s*' - - '*l20*' - - '*h20*' - tests: - #llama_v3.2_1b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,32-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,200-quant:fp8-gpus:2] - #mistral_7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2] - # torch backend - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2] - - -- condition: - terms: - supports_fp8: true - ranges: - system_gpu_count: - gte: 2 - gpu_memory: - gt: 80000 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*h20*' - tests: - #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2] - #llama_v3.2_1b trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:2] - -# 4 gpus test +# 3: A100, L40S, H100, H20, H200 - condition: ranges: system_gpu_count: gte: 4 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*l40s*' - - '*h20*' + compute_capability: + lt: 10.0 tests: - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:512,32-gpus:4] - 
perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:512,32-gpus:4] -# FP8 specific tests +# 4: A100, H100, H20, H200 test cases +# GPU memory > 80GB - condition: - terms: - supports_fp8: true ranges: system_gpu_count: gte: 4 - wildcards: - gpu: - - '*b200*' - - '*gb200*' - - '*h100*' - - '*h200*' - - '*l40s*' - - '*h20*' + compute_capability: + lt: 10.0 + gpu_memory: + gt: 80000 tests: - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4] - # Llama-Nemotron-Super-49B-v3.3 - # cpp - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-con:250-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4] 
- # pyt - # bfloat16 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - # fp8 prequantized - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:8-con:1-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8] + - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2] + +# 5: L40S, H100, H200, H20, B200, B300 test cases - condition: ranges: system_gpu_count: gte: 8 - gpu_memory: - gt: 80000 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*h20*' + compute_capability: + gt: 8.0 + lte: 10.3 tests: - # E2E trtllm-bench - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:8-con:1-gpus:8] # timeout for h20, move to l2 test - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:8-con:1-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8] + - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8] + - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4] + +# 6: L40S, H100, H200, H20, GB200, GB300 test cases +- condition: + ranges: + system_gpu_count: + gte: 4 + compute_capability: + gt: 8.0 + lte: 10.3 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-con:250-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:4-con:1-gpus:4] 
+ - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] + +# 7: H100, H200, H20 common test cases +- condition: + ranges: + system_gpu_count: + gte: 8 + compute_capability: + gte: 9.0 + lte: 9.0 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-reqs:8-con:1] + - 
perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] + - 
perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:2] + - 
perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:10-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:1] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-reqs:10-con:1] + +# 8: L20, L40S, H100, H200, H20 common test cases +- condition: + ranges: + system_gpu_count: + gte: 2 + compute_capability: + gt: 8.0 + lte: 9.0 + tests: + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,200-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,32-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:8-con:1] + +# 9: H20, H200 test cases +# gpu_memory > 100GB - condition: ranges: system_gpu_count: gte: 8 gpu_memory: gt: 100000 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*h20*' + compute_capability: + gte: 9.0 + lte: 
9.0 tests: - #mixtral_8x7b_v0.1_instruct - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:64-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] # timeout for a100 - # Llama-3_1-Nemotron-Ultra-253B-v1 - # all cpp backend, bf16->fp8 post-quantized - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8] - # pyt backend, fp8 pre-quantized - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8] - #deepseek_r1_fp8 - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] #min latency 
test - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] #max throughput test + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] + - 
perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:64-gpus:8] -# FP8 specific tests + +# 10: L20, L40S, H100, H200, H20, B200, GB200, B300, GB300 test cases +- condition: + ranges: + system_gpu_count: + gte: 2 + compute_capability: + gt: 8.0 + lte: 10.3 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a16_awq] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a8_awq] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:1000,1000-quant:fp8] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:500,2000-quant:fp8] + + +# 11: B200, GB200, B300, GB300, RTX6000-Server common test cases +- condition: + ranges: + system_gpu_count: + gte: 4 + compute_capability: + gte: 10.0 + lte: 12.0 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8] + - 
perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8] + - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-maxbs:2048-maxnt:8192-input_output_len:256,256-reqs:200] + # Phi-4-multimodal-instruct + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] + #Mistral-Small-3.1-24B-Instruct-2503 + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] TIMEOUT(120) + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200] TIMEOUT(120) + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2] + - 
perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:512,32-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:256-input_output_len:512,32-gpus:2] + - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2] + #Mistral-Small-3.1-24B-Instruct-2503 + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1-gpus:2] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200-gpus:2] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1-gpus:2] TIMEOUT(120) + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200-gpus:2] TIMEOUT(120) + - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) + #llama_v3.1_405b_instruct_fp4 + - 
perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1024,2048-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-gpus:4] TIMEOUT(120) + #llama_v3.3_70b_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) + #llama_v4_scout_17b_16e_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:500-gpus:4] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:500-gpus:4] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) + + +# 12: B200, B300, RTX6000-Server test cases - condition: - terms: - supports_fp8: true ranges: system_gpu_count: gte: 8 - wildcards: - 
gpu: - - '*h100*' - - '*h200*' - - '*l40s*' - - '*h20*' + compute_capability: + gte: 10.0 + lte: 12.0 tests: - - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4] # timeout for h100 - #llama_v3.3_70b_instruct_fp8 - # FP8 specific tests + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8] + #llama_v3.3_70b_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] + + #llama_v4_scout_17b_16e_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] + - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] + #deepseek_r1_fp8 + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] + - 
perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test + #deepseek_r1_nvfp4 + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] TIMEOUT (120) #max throughput test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test + #deepseek_r1_0528_fp4 + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8] + - 
perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] + #gpt_oss_120b + # max throughput test + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180) + # min latency test + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] + - 
perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] + + +# 13: B200, GB200, B300, GB300 test cases +- condition: + ranges: + system_gpu_count: + gte: 4 + compute_capability: + gte: 10.0 + lte: 10.3 + tests: + # for chunked prefill cases + - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4] + + +# 14: B200, B300 test cases - condition: - terms: - supports_fp8: true ranges: system_gpu_count: gte: 8 - wildcards: - gpu: - - '*b200*' - - '*h100*' - - '*h200*' - - '*l40s*' - - '*h20*' + compute_capability: + gte: 10.0 + lte: 10.3 tests: - #llama_v3.3_70b_instruct_fp8 - #trt backend - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-reqs:400-con:200-gpus:8-extra] - - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:8-con:1-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:64-con:250-gpus:8] + # for chunked prefill cases + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) From 2ddcb45b2aecb0470ed4001a19be19d85ba07ac0 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Tue, 9 Dec 2025 16:34:17 +0800 Subject: [PATCH 034/172] [None][chore] Generate lock file for release/1.2.0rc4.post1 branch automatically (#9829) Signed-off-by: Yiqing Yan --- jenkins/GenerateLock.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/GenerateLock.groovy b/jenkins/GenerateLock.groovy index 1a0f142401..0e2dd89ed1 100644 --- a/jenkins/GenerateLock.groovy +++ b/jenkins/GenerateLock.groovy @@ -114,6 +114,7 @@ pipeline { triggers { parameterizedCron(''' H 2 * * * %branchName=main;repoUrlKey=tensorrt_llm_github + H 3 * * * %branchName=release/1.2.0rc4.post1;repoUrlKey=tensorrt_llm_github ''') } From 76f49c903b050cad48b84f3ccb967693e44451dc Mon Sep 17 00:00:00 2001 From: Robin Kobus <19427718+Funatiq@users.noreply.github.com> Date: Tue, 9 Dec 2025 10:41:22 +0100 Subject: [PATCH 035/172] [None][fix] 
Additional model outputs for pipeline parallelism (#9794) Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 3 ++- .../integration/test_lists/test-db/l0_a10.yml | 2 +- .../test_lists/test-db/l0_dgx_h100.yml | 1 + .../llmapi/test_additional_model_outputs.py | 18 ++++++++++++------ 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3751dff618..316d23bf2c 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2541,7 +2541,8 @@ class PyExecutor: r for r in previous_batch.sample_state.scheduled_requests.all_requests() if r.state == LlmRequestState.GENERATION_COMPLETE and ( - r.py_return_context_logits or r.py_return_generation_logits) + r.py_return_context_logits or r.py_return_generation_logits + or r.py_additional_outputs is not None) ] if self.dist.is_first_pp_rank and len(finished_reqs): finished_reqs_py_results = [r.py_result for r in finished_reqs] diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 36a5bc32e5..4958202da2 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -74,7 +74,7 @@ l0_a10: - unittest/llmapi/test_serialization.py - unittest/llmapi/test_utils.py - unittest/llmapi/test_llm_args.py - - unittest/llmapi/test_additional_model_outputs.py + - unittest/llmapi/test_additional_model_outputs.py -m "gpu1" # executor - unittest/executor/test_rpc.py # trtllm-serve CPU-only diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index c544501a9d..0eca7d4847 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -16,6 +16,7 @@ l0_dgx_h100: orchestrator: 
mpi tests: - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2" + - unittest/llmapi/test_additional_model_outputs.py -m "gpu2" - unittest/_torch/multi_gpu -m "not post_merge" TIMEOUT (90) - unittest/_torch/auto_deploy/unit/multigpu - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism diff --git a/tests/unittest/llmapi/test_additional_model_outputs.py b/tests/unittest/llmapi/test_additional_model_outputs.py index 9e93d2daec..c0e51c95e8 100644 --- a/tests/unittest/llmapi/test_additional_model_outputs.py +++ b/tests/unittest/llmapi/test_additional_model_outputs.py @@ -135,7 +135,7 @@ class DummyConfigLoader(BaseConfigLoader): return ModelConfig(pretrained_config=DummyConfig()) -@pytest.mark.part0 +@pytest.mark.gpu1 def test_additional_model_outputs_sampling_params(): """Test that additional_model_outputs can be configured in SamplingParams.""" # Create sampling params with additional outputs @@ -153,7 +153,7 @@ def test_additional_model_outputs_sampling_params(): assert sampling_params.additional_model_outputs[1] == "generation_output" -@pytest.mark.part0 +@pytest.mark.gpu1 def test_additional_model_outputs_no_outputs(): """Test that no additional outputs are returned when not requested.""" # Create sampling params without additional outputs @@ -166,8 +166,7 @@ def test_additional_model_outputs_no_outputs(): assert sampling_params.additional_model_outputs is None -@pytest.mark.part0 -def test_additional_model_outputs_integration(): +def _test_additional_model_outputs_integration(pp_size: int): """Integration test for additional_model_outputs. This test uses a dummy model to test the additional_model_outputs feature. 
@@ -186,6 +185,7 @@ def test_additional_model_outputs_integration(): # Create LLM with the provided model llm = LLM(model=_pl.Path("dummy_path"), backend='pytorch', + pipeline_parallel_size=pp_size, max_batch_size=2, max_seq_len=128, max_num_tokens=5, @@ -278,5 +278,11 @@ def test_additional_model_outputs_integration(): expected_generation_output.unsqueeze(1)) -if __name__ == "__main__": - pytest.main([__file__]) +@pytest.mark.gpu1 +def test_additional_model_outputs_integration(): + _test_additional_model_outputs_integration(1) + + +@pytest.mark.gpu2 +def test_additional_model_outputs_integration_pp2(): + _test_additional_model_outputs_integration(2) From d600b9f8513a6802fa0a15bc1ff2f0fcba9d18f1 Mon Sep 17 00:00:00 2001 From: Stefan Niebler <82932102+stnie@users.noreply.github.com> Date: Tue, 9 Dec 2025 10:44:01 +0100 Subject: [PATCH 036/172] [TRTLLM-6756][feat] Update BeamSearch for TorchSampler (#9660) Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/sampler.py | 209 +++++++-------- .../_torch/pyexecutor/sampling_utils.py | 68 +++-- .../pyexecutor/sampling_utils_flashinfer.py | 133 +++++++++- tensorrt_llm/sampling_params.py | 11 +- .../_torch/sampler/test_beam_search.py | 239 +++--------------- 5 files changed, 331 insertions(+), 329 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index ce5a19be88..09d69bb126 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -20,7 +20,7 @@ from collections.abc import Iterable from dataclasses import dataclass from functools import cached_property from itertools import repeat -from typing import Any, Callable, Generic, List, NamedTuple, Optional, Type, TypeVar, cast +from typing import Any, Callable, Generic, List, Optional, Type, TypeVar, cast import numpy as np import torch @@ -62,6 +62,7 @@ from .finish_reason import FinishedState from .llm_request 
import LlmRequest, LlmRequestState, get_draft_token_length from .resource_manager import ResourceManager, ResourceManagerType from .sampling_utils import ( + BEAM_SEARCH_PAD_TOKEN, GREEDY, BeamSearchMetadata, GenericStrategyKeyType, @@ -200,26 +201,40 @@ class SampleStateWithMMResult: @dataclass(kw_only=True, frozen=True) class RequestGroupKey(Generic[GenericStrategyKeyType]): - strategy: GenericStrategyKeyType + strategy_key: GenericStrategyKeyType speculation_needs_probs: bool def __iter__(self): - return iter((self.strategy, self.speculation_needs_probs)) + return iter((self.strategy_key, self.speculation_needs_probs)) def __len__(self): return 2 -class RequestGroupValue(NamedTuple): +@dataclass(kw_only=True, frozen=True) +class RequestGroupValue: indices: torch.Tensor strategies: list[Strategy] + def __iter__(self): + return iter((self.indices, self.strategies)) -class RequestGroupValueWithMetadata(NamedTuple): - indices: torch.Tensor - strategies: list[Strategy] + def __len__(self): + return 2 + + +@dataclass(kw_only=True, frozen=True) +class RequestGroupValueWithMetadata(RequestGroupValue): metadata: StrategyMetadata + @override + def __iter__(self): + return iter((self.indices, self.strategies, self.metadata)) + + @override + def __len__(self): + return 3 + class EarlyStopWithMMResult(Sampler): """ @@ -325,7 +340,7 @@ def _group_requests_by_strategy_key( strategy_to_key: Callable[[Strategy, bool], GenericStrategyKeyType], pin_memory: bool = False, vocab_size: int, -) -> dict[RequestGroupKey, RequestGroupValue]: +) -> dict[RequestGroupKey[GenericStrategyKeyType], RequestGroupValue]: # NB: Client code relies on request indices in returned torch.Tensor being sorted. 
group_dict: dict[tuple[GenericStrategyKeyType, bool], tuple[list[int], list[Strategy]]] = ( defaultdict(lambda: ([], [])) @@ -344,7 +359,7 @@ def _group_requests_by_strategy_key( group_dict_entry[1].append(strategy) return { RequestGroupKey( - strategy=group_key[0], speculation_needs_probs=group_key[1] + strategy_key=group_key[0], speculation_needs_probs=group_key[1] ): RequestGroupValue( indices=torch.tensor(indices, pin_memory=pin_memory, dtype=torch.int32), strategies=strategies, @@ -648,7 +663,8 @@ class BeamHistory: cum_logprobs: torch.Tensor | None = None -class SamplingRequestsMetadata(NamedTuple): +@dataclass(kw_only=True) +class SamplingRequestsMetadata: req_num_generated_tokens: torch.Tensor req_num_beams: torch.Tensor req_num_steps: torch.Tensor @@ -716,7 +732,7 @@ class TorchSampler(Sampler): def __post_init__(self): assert self.new_tokens.shape == self.finish_reasons.shape - def create_store(self) -> Store: + def _create_store(self) -> Store: if self._use_beam_search: return self.Store( new_tokens=int_tensor(self.NEW_TOKENS_SHAPE), @@ -771,7 +787,7 @@ class TorchSampler(Sampler): # which would disallow in-place mutating of new_tokens. # So, we temporarily exit inference mode. with torch.inference_mode(False): - self.store = self.create_store() + self.store = self._create_store() # Helper tensors for finish_reasons: """Preallocate buffer needed for torch.nonzero_static(..., out=finish_reasons_nonzero_static_buffer). 
See `def _write_reason`.""" @@ -791,12 +807,9 @@ class TorchSampler(Sampler): self._grouped_sampler_cls: Type[GroupedStrategySampler] if IS_FLASHINFER_AVAILABLE and not args.disable_flashinfer_sampling: - if self._use_beam_search: # Beam search requires SimpleGroupedStrategySampler - self._grouped_sampler_cls = SimpleGroupedStrategySampler - else: - from .sampling_utils_flashinfer import FlashInferGroupedStrategySampler + from .sampling_utils_flashinfer import FlashInferGroupedStrategySampler - self._grouped_sampler_cls = FlashInferGroupedStrategySampler + self._grouped_sampler_cls = FlashInferGroupedStrategySampler else: self._grouped_sampler_cls = SimpleGroupedStrategySampler @@ -841,7 +854,7 @@ class TorchSampler(Sampler): @staticmethod def _meet_max_token_stop_criteria( - request: LlmRequest, max_seq_len: int, beam_idx: int = 0 + request: LlmRequest, max_seq_len: int, beam_idx: int = DEFAULT_BEAM_IDX ) -> bool: num_tokens = request.get_num_tokens(beam_idx) return (num_tokens - request.py_orig_prompt_len >= request.py_max_new_tokens) or ( @@ -849,7 +862,9 @@ class TorchSampler(Sampler): ) @staticmethod - def _meet_stop_token_criteria(request: LlmRequest, new_token: int, beam_idx: int = 0) -> bool: + def _meet_stop_token_criteria( + request: LlmRequest, new_token: int, beam_idx: int = DEFAULT_BEAM_IDX + ) -> bool: if request.py_stop_words_list: assert isinstance(request.py_stop_words_list, list), ( "request.py_stop_words_list should be a list" @@ -1325,7 +1340,7 @@ class TorchSampler(Sampler): logprobs_tensor: A tensor of shape (beam_width, num_generated_tokens, num_logprobs) logprobs_indices_tensor: A tensor of shape (beam_width, num_generated_tokens, num_logprobs) """ - num_generated_tokens = request.get_num_tokens(0) - request.py_prompt_len + num_generated_tokens = request.max_beam_num_tokens - request.py_prompt_len assert request.py_num_logprobs == 1, "Beam search only supports one logprob per token" logprobs_tensor = torch.empty( ( @@ -1369,7 +1384,7 @@ 
class TorchSampler(Sampler): arguments: request: The request to create the beam history for """ - num_tokens = request.get_num_tokens(0) + 1 # last token is not yet added + num_tokens = request.max_beam_num_tokens + 1 # last token is not yet added prompt_length = request.py_prompt_len num_generated_tokens = num_tokens - prompt_length num_beams = request.sampling_config.beam_width @@ -1444,7 +1459,6 @@ class TorchSampler(Sampler): self, request: LlmRequest, beam_history: BeamHistory, - finish_reasons: torch.Tensor, ) -> None: """Update the request with the corrected tokens and logprobs for each beam. @@ -1455,7 +1469,6 @@ class TorchSampler(Sampler): """ beam_width = request.sampling_config.beam_width - is_finished = self._check_beam_search_stop_criteria(request, finish_reasons=finish_reasons) assert beam_history.tokens.shape[0] == beam_width, ( f"Beam_history.tokens.shape[0] should equal beam width: \ {beam_history.tokens.shape[0]} != {beam_width}" @@ -1473,86 +1486,70 @@ class TorchSampler(Sampler): f"Beam_history.cum_logprobs.shape[0] should equal beam width: \ {beam_history.cum_logprobs.shape[0]} != {beam_width}" ) - if is_finished: - # Beams that stopped early are filled with end_id tokens. 
We need to remove those - stopped_due_to_end_id = (finish_reasons[:beam_width] == FinishReason.END_ID.value).to( - device="cuda" + valid_tokens = (beam_history.tokens != BEAM_SEARCH_PAD_TOKEN).sum(dim=-1) + gen_token_list = [] + gen_log_probs_list = [] + for beam_idx in range(beam_width): + gen_token_list.append(beam_history.tokens[beam_idx, : valid_tokens[beam_idx]].tolist()) + if request.py_return_log_probs: + gen_log_probs_list.append( + self._convert_logprobs_tensor_to_list( + beam_history.logprobs_indices[ + beam_idx : beam_idx + 1, : valid_tokens[beam_idx] + ], + beam_history.logprobs[beam_idx : beam_idx + 1, : valid_tokens[beam_idx]], + )[0] + ) + request.set_generated_tokens(gen_token_list) + if request.py_return_log_probs: + # cum_log_probs will not change when padding with end tokens. + # Therefore, we do not need to correct it + request.py_result.set_log_probs( + gen_log_probs_list, cum_log_probs=beam_history.cum_logprobs.tolist() ) - valid_tokens = (beam_history.tokens != request.py_end_id).sum( - dim=-1 - ) + stopped_due_to_end_id - gen_token_list = [] - gen_log_probs_list = [] - for beam_idx in range(beam_width): - gen_token_list.append( - beam_history.tokens[beam_idx, : valid_tokens[beam_idx]].tolist() - ) - if request.py_return_log_probs: - gen_log_probs_list.append( - self._convert_logprobs_tensor_to_list( - beam_history.logprobs_indices[ - beam_idx : beam_idx + 1, : valid_tokens[beam_idx] - ], - beam_history.logprobs[ - beam_idx : beam_idx + 1, : valid_tokens[beam_idx] - ], - )[0] - ) - request.set_generated_tokens(gen_token_list) - if request.py_return_log_probs: - # cum_log_probs will not change when padding with end tokens. 
- # Therefore, we do not need to correct it - request.py_result.set_log_probs( - gen_log_probs_list, cum_log_probs=beam_history.cum_logprobs.tolist() - ) - else: - request.set_generated_tokens(beam_history.tokens.tolist()) - if request.py_return_log_probs: - # convert logprobs to a list - token_log_probs = self._convert_logprobs_tensor_to_list( - beam_history.logprobs_indices, beam_history.logprobs - ) - request.py_result.set_log_probs( - token_log_probs, cum_log_probs=beam_history.cum_logprobs.tolist() - ) def _add_metadata_to_grouped_requests( self, requests: list[LlmRequest], - grouped_requests: dict[RequestGroupKey, RequestGroupValue], + grouped_requests: dict[RequestGroupKey[GenericStrategyKeyType], RequestGroupValue], seq_slots: torch.Tensor, - seq_lens: torch.Tensor | None = None, - ) -> dict[RequestGroupKey, RequestGroupValueWithMetadata]: - grouped_requests_with_metadata: dict[RequestGroupKey, RequestGroupValueWithMetadata] = {} + seq_lens: torch.Tensor | None, + get_metadata_type_for_group_fn: Callable[[GenericStrategyKeyType], Type[StrategyMetadata]], + ) -> dict[RequestGroupKey[GenericStrategyKeyType], RequestGroupValueWithMetadata]: + grouped_requests_with_metadata: dict[ + RequestGroupKey[GenericStrategyKeyType], RequestGroupValueWithMetadata + ] = {} for key, value in grouped_requests.items(): - match key.strategy: - case ("beam_search", _, _, _): - assert seq_lens is not None, "seq_lens is required for beam search" - metadata = BeamSearchMetadata( - cache_indirection=self.store.cache_indirection, - cache_indirection_buffer=self.store.cache_indirection_buffer, - cum_log_probs=self.store.cum_log_probs, - new_log_probs=self.store.new_log_probs, - seq_slots=seq_slots[grouped_requests[key].indices].to( - device="cuda", dtype=torch.int64, non_blocking=True - ), # Should be on device for beam search, need long for index_copy_ - seq_lens=seq_lens[grouped_requests[key].indices].to( - device="cuda", non_blocking=True - ), # Should be on device for beam search 
- finished_beams=self.store.first_finish_reasons, - predecessor_beams=self.store.predecessor_beams, - end_ids=torch.tensor( - [ - requests[request_idx].py_end_id - for request_idx in grouped_requests[key].indices - ], - dtype=torch.int32, - ).to( - device="cuda", non_blocking=True - ), # end_ids should be on device for beam search - ) - case _: - metadata = None + metadata_type = get_metadata_type_for_group_fn(key.strategy_key) + if metadata_type is BeamSearchMetadata: + assert seq_lens is not None, "seq_lens is required for beam search" + metadata = BeamSearchMetadata( + cache_indirection=self.store.cache_indirection, + cache_indirection_buffer=self.store.cache_indirection_buffer, + cum_log_probs=self.store.cum_log_probs, + new_log_probs=self.store.new_log_probs, + seq_slots=seq_slots[grouped_requests[key].indices].to( + device="cuda", dtype=torch.int64, non_blocking=True + ), # Should be on device for beam search, need long for index_copy_ + seq_lens=seq_lens[grouped_requests[key].indices].to( + device="cuda", non_blocking=True + ), # Should be on device for beam search + finished_beams=self.store.first_finish_reasons, + predecessor_beams=self.store.predecessor_beams, + end_ids=torch.tensor( + [ + requests[request_idx].py_end_id + for request_idx in grouped_requests[key].indices + ], + dtype=torch.int32, + ).to( + device="cuda", non_blocking=True + ), # end_ids should be on device for beam search + ) + elif metadata_type is None: + metadata = None + else: + raise ValueError(f"Unsupported metadata type: {metadata_type}") grouped_requests_with_metadata[key] = RequestGroupValueWithMetadata( indices=value.indices, strategies=value.strategies, @@ -1580,7 +1577,7 @@ class TorchSampler(Sampler): return longest_stop_word_len > 1 return False - @nvtx_range("maybe_finalize_beams") + @nvtx_range("maybe_create_beam_histories") def _maybe_create_beam_histories( self, requests: list[LlmRequest], @@ -1628,7 +1625,6 @@ class TorchSampler(Sampler): self._finalize_beam( req, 
beam_histories[req_idx], - finish_reasons=state.host.first_finish_reasons[req.py_seq_slot], ) else: for beam_idx in range(req.sampling_config.beam_width): @@ -1648,7 +1644,6 @@ class TorchSampler(Sampler): self._finalize_beam( req, beam_histories[req_idx], - finish_reasons=state.host.first_finish_reasons[req.py_seq_slot], ) else: for beam_idx in range(req.sampling_config.beam_width): @@ -1708,7 +1703,7 @@ class TorchSampler(Sampler): # necessary for beam search seq_lens_host = ( torch.tensor( - [r.get_num_tokens(0) for r in requests], dtype=torch.int32, pin_memory=True + [r.max_beam_num_tokens for r in requests], dtype=torch.int32, pin_memory=True ) if self._use_beam_search else None @@ -1738,6 +1733,7 @@ class TorchSampler(Sampler): beam_histories = [None] * len(requests) if self._use_beam_search: + assert seq_lens_host is not None, "seq_lens is required for beam search" seq_lens = seq_lens_host.to(device="cuda", non_blocking=True) first_finish_reasons_host = self.store.first_finish_reasons.to( device="cpu", non_blocking=True @@ -1924,6 +1920,7 @@ class TorchSampler(Sampler): cuda_device: torch.device, logits_cuda_indexer: _PackedStepIndexer, req_num_generated_tokens: torch.Tensor, + req_num_steps: torch.Tensor, req_offsets: torch.Tensor, seq_slots: torch.Tensor, seq_lens: Optional[torch.Tensor] = None, @@ -1936,7 +1933,11 @@ class TorchSampler(Sampler): strategy_to_key=self._grouped_sampler_cls.strategy_grouping_key, ) grouped_requests_with_metadata = self._add_metadata_to_grouped_requests( - requests, grouped_requests, seq_slots, seq_lens + requests, + grouped_requests, + seq_slots, + seq_lens, + get_metadata_type_for_group_fn=self._grouped_sampler_cls.get_metadata_type_for_group, ) generator_cuda = self.get_generator(cuda_device) @@ -1994,9 +1995,7 @@ class TorchSampler(Sampler): group_strategies_per_step = [ # convert from per-request to per-step strat - for strat, steps in zip( - group_strategies, req_num_generated_tokens[group_req_indices] - ) + for strat, 
steps in zip(group_strategies, req_num_steps[group_req_indices]) for _ in range(steps) ] @@ -2549,6 +2548,7 @@ class TorchSampler(Sampler): seq_slots=seq_slots, seq_lens=seq_lens, req_num_generated_tokens=sampling_requests_metadata.req_num_generated_tokens, + req_num_steps=sampling_requests_metadata.req_num_steps, token_dtype=new_tokens_cuda.dtype, ) @@ -2575,6 +2575,7 @@ class TorchSampler(Sampler): temperature=temperature, top_p=top_p, top_k=top_k, + use_beam_search=self._use_beam_search, ) diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils.py index 573615b42e..b2c660fea7 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampling_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/sampling_utils.py @@ -21,7 +21,7 @@ referring to types like LlmRequest. import abc import sys from dataclasses import dataclass -from typing import Generic, Literal, Optional, TypeAlias, TypeVar, cast +from typing import Generic, Literal, Optional, Type, TypeAlias, TypeVar, cast import torch @@ -44,6 +44,8 @@ GREEDY: Greedy = ("greedy", None) Strategy: TypeAlias = TopK | TopP | Greedy | TopKTopP | TemperatureOnly | BeamSearch +BEAM_SEARCH_PAD_TOKEN = -1 + @dataclass(kw_only=True) class StrategyMetadata: @@ -65,7 +67,16 @@ class BeamSearchMetadata(StrategyMetadata): @dataclass(frozen=True, kw_only=True) class UtilsSamplingParams: - """Subset of tensorrt_llm::runtime::SamplingConfig supported by sampling_utils.""" + """Subset of tensorrt_llm::runtime::SamplingConfig supported by sampling_utils. + + Args: + temperature: The temperature to use for sampling. + top_p: The top-p to use for sampling. + top_k: The top-k to use for sampling. + use_beam_search: Whether to use beam search. + beam_width_in: The beam_width of a request before the sampling step. + beam_width_out: The beam_width of a request after the sampling step. 
+ """ temperature: Optional[float] top_p: Optional[float] @@ -83,10 +94,11 @@ def resolve_sampling_strategy(params: UtilsSamplingParams, *, vocab_size: int) - top_p = params.top_p top_k = params.top_k - if not use_beam_search and SamplingParams.params_imply_greedy_decoding( + if SamplingParams.params_imply_greedy_decoding( temperature=temperature, top_p=top_p, top_k=top_k, + use_beam_search=use_beam_search, ): return GREEDY @@ -271,10 +283,11 @@ def update_cache_indirection_buffer( def beam_search_sampling_batch( logits: torch.Tensor, + *, beam_width_in: int, beam_width_out: int, beam_search_args: BeamSearchMetadata, - temperature: float, + temperature: float | None, generator: Optional[torch.Generator] = None, return_probs: bool = True, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -283,13 +296,13 @@ def beam_search_sampling_batch( """ logits_dim = logits.dim() assert logits_dim == 2, "logits should be 2D: [batch_size * beam_width, vocab_size]" - if temperature != 0: - logits = logits / max(temperature, 1e-5) batch_size, vocab_size = logits.size() batch_size = batch_size // beam_width_in # compute probability distribution logits = logits.view(batch_size, beam_width_in, vocab_size) + if temperature is not None and temperature != 0: + logits = logits / max(temperature, 1e-5) softmax: Optional[torch.Tensor] = None if return_probs: softmax = torch.softmax(logits, dim=-1) @@ -322,15 +335,8 @@ def beam_search_sampling_batch( # we can now use torch.where to fill the logprobs of the finished beams with -inf asynchronously logprobs = torch.where(finished_beams_mask_expanded, float("-inf"), logprobs) - - # get the offsets of the end tokens in the logprobs tensor - # NB: Modulo vocab size is necessary to prevent end_ids from being out of bounds (e.g. 
-1) - index = beam_search_args.end_ids.view(-1, 1, 1).expand(-1, beam_width_in, 1) % vocab_size - # Turn the mask into a tensor of 0s and 1s for multiplication - # NB: we use int32 because float(-inf) * 0 returns nan instead of 0 in the scatter_reduce_ - src = (~finished_beams_mask).to(torch.int32).unsqueeze(-1) - # multiply the end_id logprob of finished beams with 0, other beams multiply with 1 - logprobs.view(torch.int32).scatter_reduce_(2, index, src, "prod") + # set the first token to 0 for finished beams. We will overwrite sampling with a padding token later. + logprobs[..., 0] = torch.where(finished_beams_mask, 0, logprobs[..., 0]) # Add the current cum_log_probs to the logprobs of each beam logprobs += beam_search_args.cum_log_probs.unsqueeze(-1)[ @@ -354,9 +360,8 @@ def beam_search_sampling_batch( max_beam_width = beam_search_args.finished_beams.size(1) finished_beams = beam_search_args.finished_beams[beam_search_args.seq_slots].view(-1) - offset_predecessor_beam = ( - predecessor_beam - + torch.arange(predecessor_beam.size(0), device=predecessor_beam.device).unsqueeze(1) + offset_predecessor_beam = predecessor_beam + ( + torch.arange(predecessor_beam.size(0), device=predecessor_beam.device).unsqueeze(1) * max_beam_width ) finished_beams = finished_beams[offset_predecessor_beam] @@ -403,6 +408,9 @@ def beam_search_sampling_batch( # project the next_tokens values to the vocab_size next_tokens = next_tokens % vocab_size + ended_predecessor_mask = torch.gather(dim=1, index=predecessor_beam, input=finished_beams_mask) + # set the finished beams to the pad token + next_tokens = torch.where(ended_predecessor_mask, BEAM_SEARCH_PAD_TOKEN, next_tokens) # update the logprobs of the newly generated tokens # NB this is not needed if logprobs are not returned @@ -523,6 +531,13 @@ class GroupedStrategySampler(Generic[GenericStrategyKeyType], abc.ABC): def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> GenericStrategyKeyType: raise NotImplementedError + 
@staticmethod + @abc.abstractmethod + def get_metadata_type_for_group( + strategy_key: GenericStrategyKeyType, + ) -> Type[StrategyMetadata] | None: + raise NotImplementedError + @staticmethod @abc.abstractmethod def sample_grouped_strategies( @@ -546,6 +561,17 @@ class SimpleGroupedStrategySampler(GroupedStrategySampler[Strategy]): def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> STRATEGY_KEY_TYPE: return strategy + @override + @staticmethod + def get_metadata_type_for_group( + strategy_key: STRATEGY_KEY_TYPE, + ) -> Type[StrategyMetadata] | None: + match strategy_key: + case ("beam_search", _, _, _): + return BeamSearchMetadata + case _: + return None + @override @staticmethod def sample_grouped_strategies( @@ -558,8 +584,12 @@ class SimpleGroupedStrategySampler(GroupedStrategySampler[Strategy]): return_probs: bool, group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if group_key[0] == "beam_search": + beam_width_in = group_key[1] + else: + beam_width_in = 1 if group_logit_indices is None: - assert logits.size(0) == len(strategies) + assert logits.size(0) == beam_width_in * len(strategies) else: logits = logits[group_logit_indices] diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py index 37b3fcc132..786c953b0f 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py +++ b/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py @@ -33,6 +33,8 @@ else: from ..flashinfer_utils import get_env_enable_pdl from .sampling_utils import ( GREEDY, + BeamSearch, + BeamSearchMetadata, GroupedStrategySampler, Strategy, StrategyMetadata, @@ -40,6 +42,7 @@ from .sampling_utils import ( TopK, TopKTopP, TopP, + beam_search_sampling_batch, greedy_search_sampling_batch, ) @@ -65,6 +68,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + 
group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: pass @@ -169,6 +173,66 @@ class _StrategyImpls: new_tokens = cls._sample_from_probs(probs, generator=generator) return new_tokens, probs + class BeamSearchMixin(StrategyImpl): + def __init__( + self, + beam_width_in: torch.Tensor, + beam_width_out: torch.Tensor, + temperature: torch.Tensor, + ): + self._beam_width_in = beam_width_in + self._beam_width_out = beam_width_out + self._temperature = temperature + + @override + @classmethod + def from_strategies( + cls, strategies: list[Strategy], cuda_device: torch.device + ) -> "_StrategyImpls.BeamSearchMixin": + assert all(strat[0] == "beam_search" for strat in strategies) + narrowed_strats = cast(list[BeamSearch], strategies) + beam_width_in = cls._make_tensor( + [strat[1] for strat in narrowed_strats], torch.int32, cuda_device + ) + beam_width_out = cls._make_tensor( + [strat[2] for strat in narrowed_strats], torch.int32, cuda_device + ) + temperature = cls._make_tensor( + [strat[3] or 1.0 for strat in narrowed_strats], torch.float32, cuda_device + ) + return cls(beam_width_in, beam_width_out, temperature) + + @override + def sample( + self, + logits: torch.Tensor, + *, + group_logit_indices: Optional[torch.Tensor] = None, + generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert group_metadata is not None and isinstance(group_metadata, BeamSearchMetadata), ( + "BeamSearchMetadata is required for beam_search_sampling_batch" + ) + assert torch.unique(self._beam_width_in).numel() == 1, ( + "beam_width_in must be the same for all strategies" + ) + assert torch.unique(self._beam_width_out).numel() == 1, ( + "beam_width_out must be the same for all strategies" + ) + logits = self._prepare_logits_with_temperature( + logits, group_logit_indices, self._temperature + ) + return beam_search_sampling_batch( + logits, + 
beam_width_in=self._beam_width_in[0], + beam_width_out=self._beam_width_out[0], + beam_search_args=group_metadata, + temperature=None, + generator=generator, + return_probs=self.computes_probs(), + ) + class StrategyImplWithProbs(StrategyImpl): @override @classmethod @@ -191,6 +255,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: return self._sample_greedy_with_probs(logits, group_logit_indices=group_logit_indices) @@ -225,6 +290,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: new_tokens, probs = self._sample_with_probs( logits, @@ -263,6 +329,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: new_tokens, probs = self._sample_with_probs( logits, @@ -301,6 +368,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: new_tokens, probs = self._sample_with_probs( logits, @@ -335,6 +403,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: new_tokens, probs = self._sample_with_probs( logits, @@ -346,6 +415,9 @@ class _StrategyImpls: ) return new_tokens, probs + class BeamSearchWithProbs(BeamSearchMixin, StrategyImplWithProbs): + pass + class StrategyImplSampleOnly(StrategyImpl): @override @classmethod @@ 
-368,6 +440,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: if group_logit_indices is not None: logits = logits[group_logit_indices] @@ -404,6 +477,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: logits = self._prepare_logits_with_temperature( logits, group_logit_indices, self._temperature @@ -450,6 +524,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: probs = self._prepare_probs_with_temperature( logits, group_logit_indices, self._temperature @@ -494,6 +569,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: probs = self._prepare_probs_with_temperature( logits, group_logit_indices, self._temperature @@ -534,6 +610,7 @@ class _StrategyImpls: *, group_logit_indices: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, + group_metadata: StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: logits = self._prepare_logits_with_temperature( logits, group_logit_indices, self._temperature @@ -551,6 +628,37 @@ class _StrategyImpls: ) return new_tokens, None + class BeamSearchSampleOnly(BeamSearchMixin, StrategyImplSampleOnly): + pass + + +def _create_beam_search_specialized_cls( + beam_width_in: torch.Tensor, + beam_width_out: torch.Tensor, + return_probs: bool, +) -> Type[_StrategyImpls.BeamSearchMixin]: + """Create a class 
that implements BeamSearchMixin with static parameters for grouping.""" + + class BeamSearchSpecialized( + _StrategyImpls.BeamSearchWithProbs if return_probs else _StrategyImpls.BeamSearchSampleOnly + ): + static_beam_width_in = beam_width_in + static_beam_width_out = beam_width_out + + @override + def __hash__(self) -> int: + return hash((super(), self.static_beam_width_in, self.static_beam_width_out)) + + @override + def __eq__(self, other: object) -> bool: + return ( + super().__eq__(other) + and self.static_beam_width_in == other.static_beam_width_in + and self.static_beam_width_out == other.static_beam_width_out + ) + + return BeamSearchSpecialized + class FlashInferGroupedStrategySampler(GroupedStrategySampler[Type[_StrategyImpls.StrategyImpl]]): """Implements batched sampling with FlashInfer.sampling kernels. @@ -576,6 +684,8 @@ class FlashInferGroupedStrategySampler(GroupedStrategySampler[Type[_StrategyImpl return _StrategyImpls.TemperatureOnlyWithProbs case ("greedy", None): return _StrategyImpls.GreedyWithProbs + case ("beam_search", beam_width_in, beam_width_out, _): + return _create_beam_search_specialized_cls(beam_width_in, beam_width_out, True) else: match strategy: case ("top_p", _, _): @@ -588,6 +698,18 @@ class FlashInferGroupedStrategySampler(GroupedStrategySampler[Type[_StrategyImpl return _StrategyImpls.TemperatureOnlySampleOnly case ("greedy", None): return _StrategyImpls.GreedySampleOnly + case ("beam_search", beam_width_in, beam_width_out, _): + return _create_beam_search_specialized_cls(beam_width_in, beam_width_out, False) + + @override + @staticmethod + def get_metadata_type_for_group( + strategy_key: STRATEGY_KEY_TYPE, + ) -> Type[StrategyMetadata] | None: + if issubclass(strategy_key, _StrategyImpls.BeamSearchMixin): + return BeamSearchMetadata + else: + return None @override @staticmethod @@ -601,10 +723,14 @@ class FlashInferGroupedStrategySampler(GroupedStrategySampler[Type[_StrategyImpl return_probs: bool, group_metadata: 
StrategyMetadata | None = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if group_logit_indices is None: - assert logits.size(0) == len(strategies) + if hasattr(group_key, "static_beam_width_in"): + beam_width_in = group_key.static_beam_width_in else: - assert group_logit_indices.size(0) == len(strategies) + beam_width_in = 1 + if group_logit_indices is None: + assert logits.size(0) == beam_width_in * len(strategies) + else: + assert group_logit_indices.size(0) == beam_width_in * len(strategies) assert return_probs == group_key.computes_probs() @@ -613,4 +739,5 @@ class FlashInferGroupedStrategySampler(GroupedStrategySampler[Type[_StrategyImpl logits, group_logit_indices=group_logit_indices, generator=generator, + group_metadata=group_metadata, ) diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py index c9d6e1f44b..57bebba45e 100644 --- a/tensorrt_llm/sampling_params.py +++ b/tensorrt_llm/sampling_params.py @@ -337,9 +337,13 @@ class SamplingParams: # bindings.SamplingConfig (not SamplingParams). 
@staticmethod def params_imply_greedy_decoding( - *, temperature: Optional[float], top_p: Optional[float], top_k: Optional[int] + *, + temperature: Optional[float], + top_p: Optional[float], + top_k: Optional[int], + use_beam_search: bool | None, ): - return ( + return (not use_beam_search) and ( (temperature is None and top_p is None and top_k is None) or top_k == 1 or top_p == 0.0 @@ -348,10 +352,11 @@ class SamplingParams: @property def _greedy_decoding(self) -> bool: - return not self.use_beam_search and self.params_imply_greedy_decoding( + return self.params_imply_greedy_decoding( temperature=self.temperature, top_p=self.top_p, top_k=self.top_k, + use_beam_search=self.use_beam_search, ) @property diff --git a/tests/unittest/_torch/sampler/test_beam_search.py b/tests/unittest/_torch/sampler/test_beam_search.py index b9dc52b6c8..5a8d0fe248 100644 --- a/tests/unittest/_torch/sampler/test_beam_search.py +++ b/tests/unittest/_torch/sampler/test_beam_search.py @@ -20,7 +20,7 @@ import torch from test_beam_search_util import (BeamSearchTestOutput, DummyConfigLoader, DummyWeightLoader, get_expected_outputs) from utils.llm_data import llm_models_root -from utils.util import force_ampere +from utils.util import assert_no_cuda_sync, force_ampere from tensorrt_llm import LLM, SamplingParams from tensorrt_llm._torch.models.checkpoints import HfCheckpointLoader @@ -28,7 +28,8 @@ from tensorrt_llm._torch.pyexecutor.llm_request import (LlmRequest, SamplingConfig) from tensorrt_llm._torch.pyexecutor.sampler import BeamHistory, TorchSampler from tensorrt_llm._torch.pyexecutor.sampling_utils import ( - BeamSearchMetadata, FinishReason, beam_search_sampling_batch) + BEAM_SEARCH_PAD_TOKEN, BeamSearchMetadata, FinishReason, + beam_search_sampling_batch) from tensorrt_llm.executor import RequestError from tensorrt_llm.executor.result import CompletionOutput, GenerationResult from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig @@ -44,13 +45,16 @@ def fixed_params(): return 
{"max_tokens": 8, "max_beam_width": 2} -@pytest.fixture(scope="module", params=["TRTLLMSampler", "TorchSampler"]) -def sampler_type(request): +@pytest.fixture(scope="module", + params=[("TRTLLMSampler", False), ("TorchSampler", False), + ("TorchSampler", True)]) +def sampling_information(request): return request.param @pytest.fixture(scope="module") -def model_kwargs(fixed_params, sampler_type) -> dict[str, Any]: +def model_kwargs(fixed_params, sampling_information) -> dict[str, Any]: + assert fixed_params[ "max_beam_width"] == 2, "This test only works for a beam width of 2" return dict( @@ -59,7 +63,8 @@ def model_kwargs(fixed_params, sampler_type) -> dict[str, Any]: weight_loader=DummyWeightLoader(), config_loader=DummyConfigLoader(), ), - sampler_type=sampler_type, + sampler_type=sampling_information[0], + disable_flashinfer_sampling=sampling_information[1], ) @@ -273,7 +278,7 @@ def test_beam_search_e2e( return_context_logits=gather_context_logits, return_generation_logits=gather_generation_logits, logprobs=return_log_probs, - end_id=999, + end_id=-1, additional_model_outputs=["cache_indirection"], ) validate_outputs(llm, input_prompts[:num_prompts], sampling_params) @@ -319,7 +324,7 @@ def test_beam_search_e2e_cuda_graph_and_overlap( return_context_logits=gather_context_logits, return_generation_logits=gather_generation_logits, logprobs=return_log_probs, - end_id=999, + end_id=-1, stop_token_ids=stop_token_ids, additional_model_outputs=["cache_indirection"], ) @@ -424,15 +429,16 @@ def test_beam_search_sampling_batch_basic(): ) # Run beam search sampling - next_tokens, softmax = beam_search_sampling_batch( - logits=logits, - beam_width_in=beam_width, - beam_width_out=beam_width, - beam_search_args=beam_search_args, - temperature=temperature, - generator=None, - return_probs=True, - ) + with assert_no_cuda_sync(): + next_tokens, softmax = beam_search_sampling_batch( + logits=logits, + beam_width_in=beam_width, + beam_width_out=beam_width, + 
beam_search_args=beam_search_args, + temperature=temperature, + generator=None, + return_probs=True, + ) # Validate output shapes expected_tokens_shape = (batch_size, beam_width) @@ -443,8 +449,11 @@ def test_beam_search_sampling_batch_basic(): f"Expected shape {expected_softmax_shape}, got {softmax.shape}") # Validate tokens are within vocab range - assert torch.all(next_tokens >= 0) and torch.all( - next_tokens < vocab_size), "Tokens out of vocab range" + assert torch.all(next_tokens[1:] >= 0), "Tokens out of vocab range" + # First request has finished beams. Some beams may have BEAM_SEARCH_PAD_TOKEN (-1) as a token + assert torch.all( + next_tokens[0] >= BEAM_SEARCH_PAD_TOKEN), "Tokens out of vocab range" + assert torch.all(next_tokens < vocab_size), "Tokens out of vocab range" # Validate softmax probabilities sum to 1 torch.testing.assert_close(softmax.sum(dim=-1), @@ -521,7 +530,7 @@ def test_beam_search_sampling_batch_basic(): torch.tensor(predecessor_beam, dtype=torch.int32)) -def get_default_request(test_params: GeneralTestParams) -> LlmRequest: +def create_default_request(test_params: GeneralTestParams) -> LlmRequest: sampling_params = SamplingParams(n=test_params.beam_width, best_of=test_params.beam_width, use_beam_search=True) @@ -537,7 +546,7 @@ def get_default_request(test_params: GeneralTestParams) -> LlmRequest: is_streaming=False) -def get_default_sampler(test_params: GeneralTestParams) -> TorchSampler: +def create_default_sampler(test_params: GeneralTestParams) -> TorchSampler: sampler = TorchSampler( TorchSampler.Args( max_seq_len=test_params.max_seq_len, @@ -572,8 +581,8 @@ def test_create_beam_history(): the cache_indirection backwards to obtain the correct token sequence. 
""" test_params = GeneralTestParams() - request = get_default_request(test_params) - sampler = get_default_sampler(test_params) + request = create_default_request(test_params) + sampler = create_default_sampler(test_params) # Extract parameters from the test parameters beam_width = test_params.beam_width @@ -700,11 +709,9 @@ def test_finish_beams(): end_id = test_params.end_id batch_size = test_params.batch_size vocab_size = test_params.vocab_size - test_params.max_batch_size - max_beam_width = test_params.max_beam_width num_logprobs = 1 - request = get_default_request(test_params) - sampler = get_default_sampler(test_params) + request = create_default_request(test_params) + sampler = create_default_sampler(test_params) store_device = sampler.store.cache_indirection.device request.set_generated_tokens( @@ -732,12 +739,8 @@ def test_finish_beams(): cum_logprobs != 0 ), "Log probs and cumulative log probs must not only contain zeros. Otherwise change the seed." - tokens[batch_size - 1, 0, - num_generated_tokens // 2:] = end_id # simulate early finished beam - finish_reasons_stop_words = torch.ones( - max_beam_width, dtype=torch.int32) * FinishReason.STOP_WORDS.value - finish_reasons_end_id = torch.ones( - max_beam_width, dtype=torch.int32) * FinishReason.END_ID.value + tokens[batch_size - 1, 0, num_generated_tokens // + 2:] = BEAM_SEARCH_PAD_TOKEN # simulate early finished beam for batch_idx in range(batch_size): beam_history = BeamHistory( @@ -749,54 +752,17 @@ def test_finish_beams(): if batch_idx < batch_size - 1: # requests are not finished yet - sampler._finalize_beam(request, - beam_history, - finish_reasons=torch.zeros( - max_beam_width, dtype=torch.int32)) + sampler._finalize_beam(request, beam_history) final_tokens = torch.tensor(request.get_tokens(), device=store_device, dtype=torch.int32)[:, prompt_len:] torch.testing.assert_close(final_tokens, tokens[batch_idx, :beam_width]) - - # requests are finished by STOP_WORDS - sampler._finalize_beam(request, - 
beam_history, - finish_reasons=finish_reasons_stop_words) - final_tokens = torch.tensor(request.get_tokens(), - device=store_device, - dtype=torch.int32)[:, prompt_len:] - torch.testing.assert_close(final_tokens, - tokens[batch_idx, :beam_width]) - # requests are finished by END_ID - sampler._finalize_beam(request, - beam_history, - finish_reasons=finish_reasons_end_id) - final_tokens = torch.tensor(request.get_tokens(), - device=store_device, - dtype=torch.int32)[:, prompt_len:] - torch.testing.assert_close(final_tokens, - tokens[batch_idx, :beam_width]) - # Test the case where end_ids are present in the output else: - # requests are not finished yet - sampler._finalize_beam(request, - beam_history, - finish_reasons=torch.zeros( - max_beam_width, dtype=torch.int32)) - final_tokens = torch.tensor(request.get_tokens(), - device=store_device, - dtype=torch.int32)[:, prompt_len:] - torch.testing.assert_close(final_tokens, - tokens[batch_idx, :beam_width]) + sampler._finalize_beam(request, beam_history) - # requests are finished by STOP_WORDS - sampler._finalize_beam(request, - beam_history, - finish_reasons=finish_reasons_stop_words) - - # Given input for beam 0: [ token, token, ..., token, end_id, end_id, ..., end_id] + # Given input for beam 0: [ token, token, ..., token, BEAM_SEARCH_PAD_TOKEN, BEAM_SEARCH_PAD_TOKEN, ..., BEAM_SEARCH_PAD_TOKEN] # Expected output for beam 0: [ token, token, ..., token] final_tokens_1p = torch.tensor(request.get_tokens()[1:], device=store_device, @@ -812,133 +778,6 @@ def test_finish_beams(): final_tokens_0, tokens[batch_idx, 0, :num_generated_tokens // 2]) - # requests are finished by END_ID - sampler._finalize_beam(request, - beam_history, - finish_reasons=finish_reasons_end_id) - - # Given input for beam 0: [ token, token, ..., token, end_id, end_id, ..., end_id] - # Expected output for beam 0: [ token, token, ..., token, end_id] - final_tokens_1p = torch.tensor(request.get_tokens()[1:], - device=store_device, - 
dtype=torch.int32)[:, prompt_len:] - final_tokens_0 = torch.tensor(request.get_tokens()[0], - device=store_device, - dtype=torch.int32)[prompt_len:] - torch.testing.assert_close(final_tokens_1p, tokens[batch_idx, - 1:beam_width]) - torch.testing.assert_close(final_tokens_0.shape[0], - num_generated_tokens // 2 + 1) - torch.testing.assert_close( - final_tokens_0[:-1], tokens[batch_idx, - 0, :num_generated_tokens // 2]) - torch.testing.assert_close(final_tokens_0[-1].item(), end_id) - - -@force_ampere # Save H100 resource -class TestParameterValidation: - """Ensure that unsupported request parameters do not crash/hang the engine.""" - - @pytest.fixture(scope="module") - @staticmethod - def fixed_params(): - return {"max_tokens": 8, "max_beam_width": 4} - - @pytest.fixture(scope="module") - @staticmethod - def model_kwargs() -> dict[str, Any]: - root = llm_models_root() - assert root is not None - return dict(model=root / "llama-models-v2" / - "TinyLlama-1.1B-Chat-v1.0", ) - - # NB: Class-level fixture overrides do not work without this - @pytest.fixture(scope="module") - @staticmethod - def llm(fixed_params, input_prompts, model_kwargs): - return _build_llm(fixed_params, input_prompts, model_kwargs) - - def _check_engine_responds(self, llm: LLM, input_prompts: list[str], - fixed_params: dict): - _ = llm.generate(input_prompts, - sampling_params=SamplingParams( - max_tokens=fixed_params["max_tokens"], - n=1, - best_of=fixed_params["max_beam_width"], - use_beam_search=True, - end_id=-1, - )) - - @pytest.mark.timeout(120) - @pytest.mark.threadleak(enabled=False) - def test_use_beam_search_false( - self, - llm: LLM, - input_prompts: list[str], - fixed_params: dict, - ): - assert fixed_params["max_beam_width"] > 2 - with pytest.raises( - ValueError, - match= - ".*Greedy decoding in the LLM API does not allow multiple returns.*" - ): - _ = llm.generate(input_prompts, - sampling_params=SamplingParams( - max_tokens=fixed_params["max_tokens"], - n=1, - 
best_of=fixed_params["max_beam_width"], - use_beam_search=False, - end_id=-1, - )) - self._check_engine_responds(llm, input_prompts, fixed_params) - - @pytest.mark.timeout(120) - @pytest.mark.threadleak(enabled=False) - def test_use_beam_search_ommitted( - self, - llm: LLM, - input_prompts: list[str], - fixed_params: dict, - ): - assert fixed_params["max_beam_width"] > 2 - with pytest.raises( - ValueError, - match= - ".*Greedy decoding in the LLM API does not allow multiple returns.*" - ): - _ = llm.generate(input_prompts, - sampling_params=SamplingParams( - max_tokens=fixed_params["max_tokens"], - n=1, - best_of=fixed_params["max_beam_width"], - end_id=-1, - )) - self._check_engine_responds(llm, input_prompts, fixed_params) - - @pytest.mark.timeout(120) - @pytest.mark.threadleak(enabled=False) - def test_smaller_beam_width( - self, - llm: LLM, - input_prompts: list[str], - fixed_params: dict, - ): - assert fixed_params["max_beam_width"] > 2 - with pytest.raises( - RequestError, - match=".*Request beam width 2 is not equal to max_beam_width 4*" - ): - _ = llm.generate(input_prompts, - sampling_params=SamplingParams( - max_tokens=fixed_params["max_tokens"], - n=1, - best_of=2, - use_beam_search=True, - end_id=-1, - )) - self._check_engine_responds(llm, input_prompts, fixed_params) - @force_ampere # Save H100 resource class TestParameterValidation: From 58c29957d9017c33b33980eb7c924e2b64c4aca5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Dec 2025 17:58:25 +0800 Subject: [PATCH 037/172] [TRTLLM-9794][ci] move qwen3-next test cases to gb200 (#9827) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_dgx_b200.yml | 7 ------- .../test_lists/test-db/l0_gb200_multi_gpus.yml | 8 +++++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 
21dbac3289..f46e61aa7a 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -40,13 +40,6 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION - - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index d6ad030f10..2d710b2888 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -44,7 +44,13 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[fp8] - + - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] + - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] - condition: ranges: system_gpu_count: From 75bc386b6501215090582deef0522da51cb19e6a Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Tue, 9 Dec 2025 19:39:29 +0800 Subject: [PATCH 038/172] [None][infra] Waive failed cases for main branch on 12/09 (#9839) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index ce265f4ac2..5bdf12de44 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -442,3 +442,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) +unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_streaming SKIP (https://nvbugs/5720482) +unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) 
+perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] SKIP (https://nvbugs/5727481) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) From 3156f2e8525d741f2b783c5f9dd1e4148b44485f Mon Sep 17 00:00:00 2001 From: Dom Brown <3886319+DomBrown@users.noreply.github.com> Date: Tue, 9 Dec 2025 13:37:55 +0000 Subject: [PATCH 039/172] [https://nvbugs/5575841] [fix] Nvbug 5575841: Remove additional test waivers for TestMoEFP4 (#9788) Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com> --- tests/unittest/_torch/thop/serial/test_moe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index 83dad144cf..99ae844fc6 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -1056,7 +1056,6 @@ class TestMoeFp4: ) def test_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info): - pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, @@ -1139,7 +1138,6 @@ class TestMoeFp4: ids=["use_score_as_input", "use_topk_as_input"]) def test_no_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info, use_topk_as_input): - pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, From 07c76a5facba7ed52fc2a7eab715b787cd01731a Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 9 Dec 2025 11:06:31 -0500 Subject: [PATCH 040/172] [None][feat] Make 2-model spec dec use the 1-model kernels (Hopper) (#8810) Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> --- .../_torch/pyexecutor/model_engine.py | 2 +- tensorrt_llm/_torch/speculative/interface.py | 41 ++++++++----------- 
.../speculative/test_draft_len_schedule.py | 2 + .../_torch/speculative/test_eagle3.py | 4 +- 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index aaac2256c9..5dfbe7c9a2 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -2643,7 +2643,7 @@ class PyTorchModelEngine(ModelEngine): # attn_metadata now depends on spec_metadata since it determines the shape/content of spec_dec parameter Tensors is_spec_dec_mode = spec_metadata.spec_dec_mode.attention_need_spec_dec_mode( spec_resource_manager, self.is_draft_model, self.attn_backend, - self.model_is_wrapped, spec_metadata.is_spec_dec_tree) + self.model_is_wrapped) attn_metadata.update_spec_dec_param( batch_size=scheduled_requests.batch_size, is_spec_decoding_enabled=is_spec_dec_mode, diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py index bb301cfcb7..a02640f420 100644 --- a/tensorrt_llm/_torch/speculative/interface.py +++ b/tensorrt_llm/_torch/speculative/interface.py @@ -136,21 +136,15 @@ class SpeculativeDecodingMode(IntEnum): # 1-model has separate logic for handling draft tokens return False - if issubclass(attention_backend, - TrtllmAttention) and self.is_mtp_eagle(): - # TRTLLM MLA does not work with the chunked context mode. 
- return False - return not issubclass(attention_backend, - TrtllmAttention) or get_sm_version() != 100 + TrtllmAttention) or get_sm_version() < 90 def attention_need_spec_dec_mode( - self, - spec_resource_manager: BaseResourceManager, - is_draft_model: bool, - attention_backend: Type[AttentionBackend], - use_chain_drafter: bool, # CDL - is_spec_dec_tree: bool, + self, + spec_resource_manager: Optional[BaseResourceManager], + is_draft_model: bool, + attention_backend: Type[AttentionBackend], + use_chain_drafter: bool, # CDL ): """ If true, the attention backend kernel needs to run in spec-dec mode (multi-token query mode). @@ -159,22 +153,19 @@ class SpeculativeDecodingMode(IntEnum): is_draft_model: whether the model is a draft model. attention_backend: the attention backend. use_chain_drafter: whether to use capturable drafting loops (CDL). For the target model, it is always False. - is_spec_dec_tree: whether the spec-dec mode is a tree, i.e., static tree or dynamic tree. """ is_trtllm_attention = issubclass(attention_backend, TrtllmAttention) - # Case 1: one model - use_case_1 = self.is_eagle3_one_model() - # Case 2: eagle3 two model + draft model + CDL + is_first_draft + TRTLLM attention - use_case_2 = self.is_eagle3( - ) and spec_resource_manager.is_first_draft and use_chain_drafter and is_draft_model and is_trtllm_attention - # Case 3: eagle3 two model + tree decoding + draft model + CDL + TRTLLM attention - use_case_3 = self.is_eagle3( - ) and is_spec_dec_tree and is_draft_model and use_chain_drafter and is_trtllm_attention - # Case 4: eagle3 two model + tree decoding + target model + TRTLLM attention - use_case_4 = self.is_eagle3( - ) and is_spec_dec_tree and not is_draft_model and is_trtllm_attention - return use_case_1 or use_case_2 or use_case_3 or use_case_4 + # Always use the multi-token query mode for 1-model. + # For 2-model, we need to enable it when we process multiple tokens at once. 
This occurs with + # the target model (verification) or on the first draft for CDL based speculation. + use_case_1 = self.is_eagle3_one_model() + use_case_2 = (not is_draft_model or + (spec_resource_manager is not None + and spec_resource_manager.is_first_draft + and use_chain_drafter)) and is_trtllm_attention + + return use_case_1 or use_case_2 @staticmethod def from_string(name: Optional[str]) -> "SpeculativeDecodingMode": diff --git a/tests/unittest/_torch/speculative/test_draft_len_schedule.py b/tests/unittest/_torch/speculative/test_draft_len_schedule.py index 6d67c79e14..dc4aa57764 100644 --- a/tests/unittest/_torch/speculative/test_draft_len_schedule.py +++ b/tests/unittest/_torch/speculative/test_draft_len_schedule.py @@ -29,6 +29,7 @@ def enforce_single_worker(): # # ============================================================================ # # test 1: Generation correctness check # # ============================================================================ +@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911") @pytest.mark.parametrize( "drafter_type,schedule", [ @@ -150,6 +151,7 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict): ], ) @pytest.mark.high_cuda_memory +@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911") def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dict): if not torch.cuda.is_available(): pytest.skip("CUDA not available") diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index cab50b9789..41a60d579f 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -206,7 +206,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, num_tokens = len(new_tokens) accept_rate = num_accepted / num_drafted - assert accept_rate > 0.15 + assert accept_rate > 0.10 # Output tests sampling_params = SamplingParams(max_tokens=10, temperature=0) @@ 
-252,7 +252,7 @@ def test_llama_eagle3_long_prompt(use_cuda_graph): speculative_config=spec_config, max_batch_size=1, cuda_graph_config=cuda_graph_config, - disable_overlap_scheduler=False) + disable_overlap_scheduler=True) prompt = [", ".join(str(i) for i in range(1000))] From 7d7d05d8db045250bbf69a2d8d9ae5fc5a9f0857 Mon Sep 17 00:00:00 2001 From: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:05:19 -0500 Subject: [PATCH 041/172] [None][chore] Adding flaky auto scaling test to waives (#9851) Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 5bdf12de44..8c6a82ffb6 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -435,6 +435,7 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5726066) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP 
(https://nvbugs/5721644) From 4da31213635d7576e6dcdcbf3c21f848cb6a7de2 Mon Sep 17 00:00:00 2001 From: Eran Geva <19514940+MrGeva@users.noreply.github.com> Date: Tue, 9 Dec 2025 23:05:38 +0200 Subject: [PATCH 042/172] [#8921][chore] AutoDeploy NanoV3 to use SYMM_MEM allreduce strategy (#9797) Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com> --- examples/auto_deploy/nano_v3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/auto_deploy/nano_v3.yaml b/examples/auto_deploy/nano_v3.yaml index a847d9a8d4..be4cc4556c 100644 --- a/examples/auto_deploy/nano_v3.yaml +++ b/examples/auto_deploy/nano_v3.yaml @@ -13,8 +13,8 @@ kv_cache_config: enable_block_reuse: false transforms: detect_sharding: + allreduce_strategy: SYMM_MEM sharding_dims: ['ep', 'bmm'] - allreduce_strategy: 'AUTO' manual_config: head_dim: 128 tp_plan: From 5de4e3f621e3fbb636ae9f1be22ae6b856e43c91 Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Tue, 9 Dec 2025 13:34:09 -0800 Subject: [PATCH 043/172] [TRTINFRA-7328][infra] Consume SlurmCluster scratchPath and cleanup mounts (#9600) Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 57 ++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 2e3af6fa36..bc8ceaaf31 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -459,7 +459,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){ Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30") def cleanupCommands = [ - "rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", + "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", "rm -rf ${jobWorkspace} || true", ].join(" && ") Utils.exec( @@ -510,7 +510,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster 
cluster, String nodeName, St def entrypoint = SlurmConfig.containerRuntimeToEntrypoint[cluster.containerRuntime] def cleanupCommands = [ "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true", - "rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", + "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", ].join(" && ") Utils.exec( pipeline, @@ -565,12 +565,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))") - // Specific for OCI machines - def mounts = [ - "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro", - "/home/svc_tensorrt:/home/svc_tensorrt", - "/home/svc_tensorrt/.cache:/root/.cache" - ].join(",") + def mounts = getMountListForSlurmTest(cluster, false).join(",") def slurmSubmitOutput = Utils.exec( pipeline, timeout: false, @@ -822,6 +817,42 @@ def getPytestBaseCommandLine( return testCmdLine as String[] } +def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false) +{ + def mounts = [] + + // mounts for SLURM job submission and logs + if (useSbatch) { + mounts += [ + "/home/svc_tensorrt/bloom/scripts", + ] + } else { + mounts += [ + "/home/svc_tensorrt/bloom/scripts", + "/home/svc_tensorrt/slurm-logs", + ] + } + + // data/cache mounts + if (cluster.containerRuntime == ContainerRuntime.DOCKER) { + mounts += [ + "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro", + ] + } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) { + if (!cluster.scratchPath) { + throw new Exception("Scratch path is not set for cluster: ${cluster.name}") + } + mounts += [ + "${cluster.scratchPath}:/scratch.trt_llm_data:ro", + "/home/svc_tensorrt/.cache:/root/.cache", + ] + } else { + 
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") + } + + return mounts +} + def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312") { SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition @@ -959,7 +990,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") - def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts" + def mounts = getMountListForSlurmTest(cluster, true).join(",") String[] taskArgs = getNodeArgs(nodeCount, gpuCount) if (taskArgs == null) { error "Invalid Slurm test stage name is set" @@ -971,13 +1002,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def containerImageArg = container def srunPrologue = "" if (cluster.containerRuntime == ContainerRuntime.ENROOT) { - mounts = [ - "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro", - "/home/svc_tensorrt/bloom/scripts", - "/home/svc_tensorrt/.cache:/root/.cache", - ].join(",") - - def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh" + def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh" containerImageArg = enrootImagePath srunPrologue = """ From ff0ef19ee9abc99e8c0df519c380d3d3bb003a0b Mon Sep 17 00:00:00 2001 From: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:51:46 -0500 Subject: [PATCH 044/172] [https://nvbugs/5688388][chore] Unwaiving fixed disagg test (#9800) Signed-off-by: Patrice Castonguay 
<55748270+pcastonguay@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8c6a82ffb6..5822fb9cbb 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -370,7 +370,6 @@ full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evi unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911) test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153) accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5688388) accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721) unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-False-False-False-False-True-False-False] SKIP (https://nvbugs/5691246) test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) From 414448bb379d50a49b509de0207084a00aca8d0e Mon Sep 17 00:00:00 2001 From: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> Date: Tue, 9 Dec 2025 18:21:50 -0500 Subject: [PATCH 045/172] [https://nvbugs/5719561][chore] Unwaive tests for nvbug 5719561 (#9801) Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 5822fb9cbb..440e516112 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -415,18 +415,6 @@ 
unittest/_torch/speculative/test_spec_gate.py::test_spec_gate_e2e SKIP (https:// accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5569696) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] 
SKIP (https://nvbugs/5719561) -disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5719561) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) From 2d33ae94d532fe0a6be2578b1d76880c3c5e55d1 Mon Sep 17 00:00:00 2001 From: dhansen-nvidia <218031328+dhansen-nvidia@users.noreply.github.com> Date: Tue, 9 Dec 2025 18:51:31 -0500 Subject: [PATCH 046/172] =?UTF-8?q?[https://nvbugs/5508301][feat]=20Move?= =?UTF-8?q?=20D->H=20copies=20to=20a=20worker=20thread=20whe=E2=80=A6=20(#?= =?UTF-8?q?8463)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Dan Hansen <1+dhansen-nvidia@users.noreply.github.com> Signed-off-by: dhansen-nvidia <218031328+dhansen-nvidia@users.noreply.github.com> Co-authored-by: Dan Hansen <1+dhansen-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/_util.py | 14 +- tensorrt_llm/_torch/pyexecutor/py_executor.py | 13 +- tensorrt_llm/_torch/pyexecutor/sampler.py | 159 +++++++++++++++--- tensorrt_llm/_utils.py | 44 +++++ tensorrt_llm/llmapi/llm_args.py | 9 + .../defs/accuracy/test_llm_api_pytorch.py | 26 ++- .../test_lists/qa/llm_digits_func.txt | 5 +- .../test_lists/qa/llm_function_core.txt | 32 ++-- .../qa/llm_function_core_sanity.txt | 31 ++-- .../test_lists/qa/llm_function_l20.txt | 8 +- .../test_lists/qa/llm_function_nim.txt | 26 +-- .../test_lists/test-db/l0_dgx_h100.yml | 25 +-- .../test_lists/test-db/l0_dgx_h200.yml | 46 ++--- .../test_lists/test-db/l0_h100.yml | 5 +- 
tests/integration/test_lists/waives.txt | 2 +- .../api_stability/references/llm.yaml | 4 + 16 files changed, 335 insertions(+), 114 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index 86abfae483..385d4d52a1 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -8,7 +8,8 @@ import tensorrt_llm.bindings.executor as trtllm from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.models.modeling_utils import \ MODEL_CLASS_VISION_ENCODER_MAPPING -from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str +from tensorrt_llm._utils import (confidential_compute_enabled, + str_dtype_to_binding, torch_dtype_to_str) from tensorrt_llm.bindings.executor import DecodingMode from tensorrt_llm.llmapi.llm_args import (CacheTransceiverConfig, EagleDecodingConfig, KvCacheConfig, @@ -855,6 +856,7 @@ def create_torch_sampler_args( max_beam_width: int, disable_overlap_scheduler: bool, disable_flashinfer_sampling: bool, + enable_async_worker: bool, ): max_num_sequences = max_batch_size * mapping.pp_size max_draft_len = (0 if speculative_config is None else @@ -869,7 +871,8 @@ def create_torch_sampler_args( max_num_sequences=max_num_sequences, max_beam_width=max_beam_width, disable_flashinfer_sampling=disable_flashinfer_sampling, - disable_overlap_scheduler=disable_overlap_scheduler) + disable_overlap_scheduler=disable_overlap_scheduler, + enable_async_worker=enable_async_worker) def instantiate_sampler( @@ -886,6 +889,9 @@ def instantiate_sampler( kv_cache_config: KvCacheConfig, disable_flashinfer_sampling: bool, ): + enable_async_worker = (confidential_compute_enabled() + or llm_args.sampler_force_async_worker) + sampler_args = create_torch_sampler_args( mapping, max_seq_len=engine.max_seq_len, @@ -894,6 +900,7 @@ def instantiate_sampler( max_beam_width=max_beam_width, disable_overlap_scheduler=llm_args.disable_overlap_scheduler, 
disable_flashinfer_sampling=disable_flashinfer_sampling, + enable_async_worker=enable_async_worker, ) decoding_mode = get_decoding_mode(decoding_config=decoding_config, max_beam_width=max_beam_width) @@ -920,7 +927,8 @@ def instantiate_sampler( max_batch_size=max_batch_size, max_beam_width=max_beam_width, decoding_config=decoding_config, - kv_cache_config=kv_cache_config) + kv_cache_config=kv_cache_config, + enable_async_worker=enable_async_worker) if not engine.model.model_config.is_generation: # NOTE: choose sampler based on model type return EarlyStopSampler() diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 316d23bf2c..5459c62559 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -52,7 +52,8 @@ from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, LlmResponse, get_draft_token_length) from .model_engine import ModelEngine from .resource_manager import ResourceManager -from .sampler import Sampler, SampleState, SampleStateTensors +from .sampler import (AsyncWorkerMixin, Sampler, SamplerEvent, SampleState, + SampleStateTensors) from .scheduler import (RequestScheduler, ScheduledRequests, SerializableSchedulerOutput) @@ -369,6 +370,10 @@ class PyExecutor: target=self._event_loop_wrapper, daemon=True) self.worker_thread.start() self.worker_started = True + # Start the sampler's async worker, if it is enabled + if (isinstance(self.sampler, AsyncWorkerMixin) + and self.sampler.async_worker_enabled()): + self.sampler.async_worker_start() def _set_global_steady_clock_offset(self): assert self.global_rank >= 0, "rank should be >= 0" @@ -461,6 +466,10 @@ class PyExecutor: keys = list(self.virtual_memory_pools.keys()) for key in keys: del self.virtual_memory_pools[key] + # Stop the sampler's async worker, if it was used + if (isinstance(self.sampler, AsyncWorkerMixin) + and self.sampler.async_worker_enabled()): + 
self.sampler.async_worker_stop() def can_enqueue_requests(self) -> bool: """ @@ -1706,7 +1715,7 @@ class PyExecutor: self._update_request_states(scheduled_batch) return self.sampler.SampleState( scheduled_requests=scheduled_batch, - sampler_event=sampler_event, + sampler_event=SamplerEvent(cuda_event=sampler_event), ) def _validate_request(self, request: LlmRequest): diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 09d69bb126..11519f6aa5 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -17,6 +17,7 @@ import sys from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Iterable +from concurrent import futures from dataclasses import dataclass from functools import cached_property from itertools import repeat @@ -96,6 +97,17 @@ class SampleStateTensors: return vars(self).values() +@dataclass(kw_only=True) +class SamplerEvent: + cuda_event: torch.cuda.Event + worker_futures: Optional[list[futures.Future[Any]]] = None + + def synchronize(self): + if self.worker_futures: + futures.wait(self.worker_futures) + self.cuda_event.synchronize() + + @dataclass(kw_only=True) class SampleState: scheduled_requests: ScheduledRequests @@ -103,7 +115,7 @@ class SampleState: device: Optional[SampleStateTensors] = None host: Optional[SampleStateTensors] = None - sampler_event: Optional[torch.cuda.Event] = None + sampler_event: Optional[SamplerEvent] = None class Sampler(ABC): @@ -688,7 +700,108 @@ class SampleStateTorch(SampleState): beam_histories: list[BeamHistory | None] | None = None -class TorchSampler(Sampler): +class AsyncWorkerMixin: + """ + Mixin that adds the ability to fork off operations to run on a worker + thread (particularly D2H copies). If the async worker isn't active, + operations will seamlessly run on the main thread. 
+ """ + + MAX_WORKERS = 1 + + def _async_worker_active(self) -> bool: + return getattr(self, "_async_worker", None) is not None + + def _async_worker_init(self, enable_async_worker: bool): + self._enable_async_worker = enable_async_worker + self._async_worker = None + self._async_worker_futures: list[futures.Future[any]] = [] + + def async_worker_enabled(self): + return getattr(self, "_enable_async_worker", False) + + def async_worker_start(self): + assert self.async_worker_enabled() + if not self._async_worker_active(): + + def _async_worker_initializer(device_id): + # The current device is set per thread, so we need to set it + # again here + torch.cuda.set_device(device_id) + # Submit the host copies in a separate stream to prevent the + # blocking copies from gating subsequent async work + torch.cuda.set_stream(torch.cuda.Stream()) + + self._async_worker = futures.ThreadPoolExecutor( + max_workers=self.MAX_WORKERS, + initializer=_async_worker_initializer, + initargs=(torch.cuda.current_device(),), + ) + + def async_worker_stop(self): + assert self.async_worker_enabled() + if self._async_worker_active(): + self._async_worker.shutdown(wait=True) + self._async_worker = None + + @torch.inference_mode() + def _async_copy_to_host( + self, copy_ready: torch.cuda.Event, dest: torch.Tensor, src: torch.Tensor + ): + # Make sure the async work takes place after all prior operations on + # the primary stream. 
synchronize() is intentionally chosen instead of + # wait() here; otherwise, blocking copies will stall subsequent CUDA + # API calls on the main stream/thread + copy_ready.synchronize() + + # Note that the omission of non_blocking=True here is intentional; Work + # submitted to the async worker is expected to block at the end, + # consistent with the semantics of futures + dest.copy_(src) + + def _copy_to_host(self, src: torch.Tensor) -> torch.Tensor: + dest = torch.empty_like(src, device="cpu", pin_memory=True) + if self._async_worker_active(): + # Create a snapshot of the source on the main stream, so as to + # guarantee that the tensor data hasn't been modified before the + # copy. This precaution is only needed because the copy will + # execute on a side stream and thus there is no guarantee that + # future operations on the main stream won't race to modify the + # tensor data before we copy it. + src_snapshot = src.clone() + + # Record an event on the main thread/stream that we will + # synchronize with on the worker thread/stream + copy_ready = torch.cuda.Event() + copy_ready.record() + + # Submit the copy to the async worker thread + result = self._async_worker.submit( + self._async_copy_to_host, copy_ready, dest, src_snapshot + ) + + # Save the future, so that we can await it later + self._async_worker_futures.append(result) + else: + # If the async worker is not in use, just copy as usual + dest.copy_(src, non_blocking=True) + return dest + + def _record_sampler_event(self) -> SamplerEvent: + cuda_event = torch.cuda.Event() + cuda_event.record() + + # Transfer ownership to worker_futures and re-initialize + if self._async_worker_active(): + worker_futures = self._async_worker_futures + self._async_worker_futures = [] + else: + worker_futures = None + + return SamplerEvent(cuda_event=cuda_event, worker_futures=worker_futures) + + +class TorchSampler(Sampler, AsyncWorkerMixin): SampleState = SampleStateTorch @override @@ -768,6 +881,7 @@ class 
TorchSampler(Sampler): max_total_draft_tokens: int disable_overlap_scheduler: bool = False disable_flashinfer_sampling: bool = False + enable_async_worker: bool = False def __init__(self, args: Args): self.max_seq_len = args.max_seq_len @@ -820,6 +934,8 @@ class TorchSampler(Sampler): # Force number of accepted tokens for speculative decoding testing self._force_num_accepted_tokens = get_force_num_accepted_tokens() + self._async_worker_init(args.enable_async_worker) + def get_generator(self, device: torch.device) -> torch.Generator: """Get a deterministic generator for the specified device. @@ -1729,22 +1845,19 @@ class TorchSampler(Sampler): first_finish_reasons=first_finish_reasons, predecessor_beams=self.store.predecessor_beams, ) - finish_reasons_host = finish_reasons.to(device="cpu", non_blocking=True) + finish_reasons_host = self._copy_to_host(finish_reasons) beam_histories = [None] * len(requests) if self._use_beam_search: assert seq_lens_host is not None, "seq_lens is required for beam search" seq_lens = seq_lens_host.to(device="cuda", non_blocking=True) - first_finish_reasons_host = self.store.first_finish_reasons.to( - device="cpu", non_blocking=True - ) + first_finish_reasons_host = self._copy_to_host(self.store.first_finish_reasons) self._update_original_tokens(seq_slots, seq_lens, new_tokens) self._maybe_create_beam_histories( requests, finish_reasons=first_finish_reasons, beam_histories=beam_histories ) - sampler_event = torch.cuda.Event() - sampler_event.record() + sampler_event = self._record_sampler_event() return SampleStateTorch( scheduled_requests=scheduled_requests, device=SampleStateTensors(new_tokens=new_tokens), @@ -1881,10 +1994,8 @@ class TorchSampler(Sampler): logprobs_cuda, k=max(req.py_num_logprobs for req in requests), dim=-1 ) # Use a single D2H copy to reduce overheads - topk_vals = torch.empty_like(topk_vals_cuda, device="cpu", pin_memory=True) - topk_indices = torch.empty_like(topk_indices_cuda, device="cpu", pin_memory=True) - 
topk_vals.copy_(topk_vals_cuda, non_blocking=True) - topk_indices.copy_(topk_indices_cuda, non_blocking=True) + topk_vals = self._copy_to_host(topk_vals_cuda) + topk_indices = self._copy_to_host(topk_indices_cuda) current_offset = 0 for req_id, steps in zip( logprobs_req_indices, req_num_generated_tokens[logprobs_req_indices].tolist() @@ -2090,7 +2201,7 @@ class TorchSampler(Sampler): new_tokens_cuda.view(-1, *new_tokens_cuda.shape[2:]).scatter_( 0, batch_dest_indices_1d_cuda, batch_next_tokens_cuda_int ) - new_tokens_host = new_tokens_cuda.to("cpu", non_blocking=True) + new_tokens_host = self._copy_to_host(new_tokens_cuda) return new_tokens_host @@ -2604,7 +2715,7 @@ class SampleStateTRTLLM(SampleState): host: Optional[SampleStateTensorsHostTRTLLM] = None -class TRTLLMSampler(Sampler): +class TRTLLMSampler(Sampler, AsyncWorkerMixin): MAX_DECODING_TOKENS = 1 # It must be 1 when not in speculative decoding SampleState = SampleStateTRTLLM @@ -2624,6 +2735,7 @@ class TRTLLMSampler(Sampler): max_beam_width: int, decoding_config: Optional[DecodingConfig] = None, kv_cache_config: Optional[KvCacheConfig] = None, + enable_async_worker: bool = False, ): vocab_size = model.config.vocab_size num_hidden_layers = model.config.num_hidden_layers @@ -2674,6 +2786,8 @@ class TRTLLMSampler(Sampler): self._initialize_store() self._instantiate_algorithms() + self._async_worker_init(enable_async_worker) + def _initialize_store(self): torch_stream = torch.cuda.current_stream().cuda_stream cuda_stream = CudaStream(torch_stream) @@ -2831,17 +2945,17 @@ class TRTLLMSampler(Sampler): finalize_events[request.request_id] = self._finalize_request(request, False) elif request.streaming: finalize_events[request.request_id] = self._finalize_request(request, True) - gathered_ids = self.store["decoder_state"].gathered_ids.to("cpu", non_blocking=True) - new_output_tokens = self.store["decoder_state"].all_new_tokens.to("cpu", non_blocking=True) - finished_sum = 
self.store["decoder_state"].finished_sum.to("cpu", non_blocking=True) - finish_reasons = self.store["decoder_state"].finish_reasons.to("cpu", non_blocking=True) - sequence_lengths = self.store["decoder_state"].sequence_lengths.to("cpu", non_blocking=True) + gathered_ids = self._copy_to_host(self.store["decoder_state"].gathered_ids) + new_output_tokens = self._copy_to_host(self.store["decoder_state"].all_new_tokens) + finished_sum = self._copy_to_host(self.store["decoder_state"].finished_sum) + finish_reasons = self._copy_to_host(self.store["decoder_state"].finish_reasons) + sequence_lengths = self._copy_to_host(self.store["decoder_state"].sequence_lengths) log_probs = None cum_log_probs = None if any(request.py_return_log_probs for request in scheduled_requests.all_requests()): - log_probs = self.store["decoder_state"].log_probs.to("cpu", non_blocking=True) - cum_log_probs = self.store["decoder_state"].cum_log_probs.to("cpu", non_blocking=True) + log_probs = self._copy_to_host(self.store["decoder_state"].log_probs) + cum_log_probs = self._copy_to_host(self.store["decoder_state"].cum_log_probs) device = SampleStateTensors(new_tokens=self.store["decoder_state"].all_new_tokens) @@ -2855,8 +2969,7 @@ class TRTLLMSampler(Sampler): gathered_ids=gathered_ids, ) - sampler_event = torch.cuda.Event() - sampler_event.record() + sampler_event = self._record_sampler_event() self.micro_batch_idx = (self.micro_batch_idx + 1) % self.num_micro_batches diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 68229e4150..cdcd012bd3 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -1168,6 +1168,50 @@ def set_prometheus_multiproc_dir() -> object: f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}") +def confidential_compute_enabled() -> bool: + """ + Query NVML for the confidential compute state + """ + + cc_enabled = False + + try: + # Init + import pynvml + pynvml.nvmlInit() + + # Hopper and newer supports a more nuanced query of 
confidential + # compute settings + cc_settings = pynvml.c_nvmlSystemConfComputeSettings_v1_t() + if (pynvml.nvmlSystemGetConfComputeSettings(cc_settings) == + pynvml.NVML_SUCCESS): + cc_enabled = (cc_settings.ccFeature + == pynvml.NVML_CC_SYSTEM_FEATURE_ENABLED + or cc_settings.multiGpuMode + == pynvml.NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE + or cc_settings.multiGpuMode + == pynvml.NVML_CC_SYSTEM_MULTIGPU_NVLE) + except pynvml.NVMLError_NotSupported: + # Simple query for older GPUs + try: + cc_state = pynvml.nvmlSystemGetConfComputeState() + cc_enabled = ( + cc_state.ccFeature == pynvml.NVML_CC_SYSTEM_FEATURE_ENABLED) + except Exception as e: + logger.error(f"Error querying confidential compute state: {str(e)}") + except Exception as e: + logger.error(f"Error querying confidential compute state: {str(e)}") + finally: + # Shutdown + try: + pynvml.nvmlShutdown() + except: + # Ignore shutdown errors + pass + + return cc_enabled + + P = ParamSpec("P") diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 9f154c53f6..8627c883b9 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2609,6 +2609,15 @@ class TorchLlmArgs(BaseLlmArgs): "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler unless BeamSearch is requested.", status="beta") + sampler_force_async_worker: bool = Field( + default=False, + description="Force usage of the async worker in the sampler for D2H " + "copies, even if confidential compute is not active. Normally, the " + "async worker should only be used when confidential compute is active. 
" + "This argument is provided to enable it for testing purposes, " + "irrespective of confidential compute state.", + status="prototype") + enable_iter_perf_stats: bool = Field( default=False, description="Enable iteration performance statistics.", diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 590f4d92c2..8cf33c5f12 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -264,10 +264,13 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): @skip_pre_hopper @parametrize_with_ids("overlap_scheduler", [True, False]) @parametrize_with_ids("eagle3_one_model", [True, False]) - def test_eagle3(self, overlap_scheduler, eagle3_one_model): + @parametrize_with_ids("sampler_async_worker", [True, False]) + def test_eagle3(self, overlap_scheduler, eagle3_one_model, + sampler_async_worker): pytorch_config = dict( max_batch_size= 1, # add max_batch_size to avoid error in overlap scheduler + sampler_force_async_worker=sampler_async_worker, disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig(max_batch_size=1, enable_padding=True), @@ -431,6 +434,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @parametrize_with_ids("sampler_async_worker", [True, False]) @parametrize_with_ids("disable_overlap_scheduler", [False, True]) @parametrize_with_ids( "enable_cuda_graph,enable_padding", @@ -440,7 +444,8 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): (True, True), # CUDA Graph with padding ]) def test_auto_dtype_beam_search(self, enable_cuda_graph, enable_padding, - disable_overlap_scheduler): + disable_overlap_scheduler, + sampler_async_worker): max_beam_width = 2 sampling_params = SamplingParams(n=max_beam_width, best_of=max_beam_width, @@ -465,6 +470,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): 
max_batch_size=max_beam_width, max_seq_len=2048, max_beam_width=max_beam_width, + sampler_force_async_worker=sampler_async_worker, disable_overlap_scheduler=disable_overlap_scheduler, cuda_graph_config=cuda_graph_config, ) as llm: @@ -474,6 +480,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): extra_acc_spec="beam_width=2") @skip_pre_hopper + @parametrize_with_ids("sampler_async_worker", [True, False]) @parametrize_with_ids("disable_overlap_scheduler", [False, True]) @parametrize_with_ids( "enable_cuda_graph,enable_padding", @@ -483,7 +490,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): (True, True), # CUDA Graph with padding ]) def test_fp8_beam_search(self, enable_cuda_graph, enable_padding, - disable_overlap_scheduler): + disable_overlap_scheduler, sampler_async_worker): model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8" max_beam_width = 2 sampling_params = SamplingParams(n=max_beam_width, @@ -509,6 +516,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): max_seq_len=2048, max_beam_width=max_beam_width, disable_overlap_scheduler=disable_overlap_scheduler, + sampler_force_async_worker=sampler_async_worker, cuda_graph_config=cuda_graph_config, ) @@ -539,14 +547,17 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness): @skip_pre_hopper @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("sampler_async_worker", [True, False]) @pytest.mark.parametrize("disable_overlap_scheduler", [True, False]) @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"]) - def test_return_logits_pp(self, pp_size, disable_overlap_scheduler): + def test_return_logits_pp(self, pp_size, disable_overlap_scheduler, + sampler_async_worker): prompts = ["A B C"] llm = LLM(model=self.MODEL_PATH, pipeline_parallel_size=pp_size, - disable_overlap_scheduler=disable_overlap_scheduler) + disable_overlap_scheduler=disable_overlap_scheduler, + sampler_force_async_worker=sampler_async_worker) sampling_params = 
SamplingParams(max_tokens=8, return_context_logits=True, @@ -1559,6 +1570,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_device(4) @skip_pre_hopper @skip_ray + @parametrize_with_ids("sampler_async_worker", [True, False]) @parametrize_with_ids("torch_compile", [False, True]) @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler", [(False, False, False, False), @@ -1574,7 +1586,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): ids=["tp4", "ep4", "tp2pp2", "pp4"]) def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, - overlap_scheduler, torch_compile): + overlap_scheduler, torch_compile, + sampler_async_worker): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = TorchCompileConfig( enable_fullgraph=True, @@ -1587,6 +1600,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): torch_compile_config=torch_compile_config, moe_config=MoeConfig( backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS"), + sampler_force_async_worker=sampler_async_worker, ) if fp8kv: diff --git a/tests/integration/test_lists/qa/llm_digits_func.txt b/tests/integration/test_lists/qa/llm_digits_func.txt index 8cfe98fe11..30e3f22384 100644 --- a/tests/integration/test_lists/qa/llm_digits_func.txt +++ b/tests/integration/test_lists/qa/llm_digits_func.txt @@ -16,8 +16,9 @@ test_e2e.py::test_ptp_quickstart_advanced[Mistral-Nemo-12b-Base-Mistral-Nemo-Bas test_e2e.py::test_ptp_quickstart_advanced[DeepSeek-R1-Distill-Qwen-32B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 7e58d0f500..71b73a530f 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -390,8 +390,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] @@ -403,18 +404,20 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 @@ -583,6 +586,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] 
+accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index f468d262b1..228d748e45 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -120,26 +120,29 @@ accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype diff --git a/tests/integration/test_lists/qa/llm_function_l20.txt b/tests/integration/test_lists/qa/llm_function_l20.txt index 5015e7ee15..772e39a683 100644 --- a/tests/integration/test_lists/qa/llm_function_l20.txt +++ b/tests/integration/test_lists/qa/llm_function_l20.txt @@ -19,9 +19,11 @@ accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 357cc80a05..515957f2b3 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -211,18 +211,20 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False] 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] 
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 0eca7d4847..530b9cf5f1 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -152,17 +152,20 @@ l0_dgx_h100: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype0] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype1] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index d9e80819e9..c4d42214ae 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -51,27 +51,31 @@ l0_dgx_h200: stage: post_merge backend: pytorch tests: - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 42859c06ec..f3dc84e81e 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -59,8 +59,9 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 440e516112..5b75702332 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -375,7 +375,7 @@ unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-Fa test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) examples/test_ray.py::test_llm_inference_distributed_ray[tep2] SKIP (https://nvbugs/5701457) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5701445) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/5701445) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5666821) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5666821) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5666821) diff 
--git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 6d02fed397..e316d45a81 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -127,6 +127,10 @@ methods: annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType] default: auto status: beta + sampler_force_async_worker: + annotation: bool + default: False + status: prototype enable_iter_perf_stats: annotation: bool default: False From 36c9e7cfe670db782d69f37bcc772baaa5c86ff1 Mon Sep 17 00:00:00 2001 From: zhanghaotong Date: Wed, 10 Dec 2025 10:34:08 +0800 Subject: [PATCH 047/172] [None][chore] Add unittest for otlp tracing (#8716) Signed-off-by: zhanghaotong Signed-off-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> --- ATTRIBUTIONS-Python.md | 29727 +++++++++++++--- requirements-dev.txt | 4 + .../integration/test_lists/test-db/l0_a10.yml | 1 + tests/unittest/others/test_tracing.py | 204 + 4 files changed, 24124 insertions(+), 5812 deletions(-) create mode 100644 tests/unittest/others/test_tracing.py diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index 4e350512a2..3cff1f7398 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -21,7 +21,225 @@ This project uses the following third-party libraries. Each library is open-sour This file is automatically generated. Please do not edit it directly. -## accelerate (1.10.1) +## absl-py (2.3.1) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Changelog`: https://github.com/abseil/abseil-py/blob/main/CHANGELOG.md + - `Documentation`: https://abseil.io/docs/python/ + - `Issues`: https://github.com/abseil/abseil-py/issues + - `Source`: https://github.com/abseil/abseil-py + + +## accelerate (1.12.0) ### Licenses License: `Apache` @@ -575,7 +793,7 @@ PERFORMANCE OF THIS SOFTWARE. - `Repository`: https://github.com/aio-libs/aiohappyeyeballs -## aiohttp (3.13.0) +## aiohttp (3.13.2) ### Licenses License: `Apache-2.0 AND MIT` @@ -855,6 +1073,42 @@ Apache License - `Homepage`: https://github.com/aio-libs/aiosignal +## alembic (1.17.2) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Copyright 2009-2025 Michael Bayer. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://alembic.sqlalchemy.org/en/latest/changelog.html + - `Documentation`: https://alembic.sqlalchemy.org/en/latest/ + - `Homepage`: https://alembic.sqlalchemy.org + - `Issue Tracker`: https://github.com/sqlalchemy/alembic/issues/ + - `Source`: https://github.com/sqlalchemy/alembic/ + + ## annotated-types (0.7.0) ### Licenses @@ -1009,6 +1263,197 @@ THE SOFTWARE. - `Repository`: https://github.com/litl/backoff +## bandit (1.7.7) + +### Licenses +License: `Apache-2.0 license` + + - `LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+``` + +### URLs + - `Homepage`: https://bandit.readthedocs.io/ + - `Issue Tracker`: https://github.com/PyCQA/bandit/issues + - `Release Notes`: https://github.com/PyCQA/bandit/releases + - `Source Code`: https://github.com/PyCQA/bandit + + ## blake3 (1.0.8) ### Licenses @@ -1425,7 +1870,7 @@ DEALINGS IN THE SOFTWARE. - `source`: https://github.com/pypa/build -## certifi (2025.10.5) +## certifi (2025.11.12) ### Licenses License: `MPL-2.0` @@ -1469,24 +1914,24 @@ License: `MIT` Except when otherwise stated (look for LICENSE files in directories or information at the beginning of each file) all software and -documentation is licensed as follows: +documentation is licensed as follows: MIT No Attribution - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, - copy, modify, merge, publish, distribute, sublicense, and/or - sell copies of the Software, and to permit persons to whom the + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is furnished to do so. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` @@ -1500,6 +1945,556 @@ documentation is licensed as follows: - `Source Code`: https://github.com/python-cffi/cffi +## cfgv (3.5.0) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` +Copyright (c) 2018 Anthony Sottile + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/asottile/cfgv + + +## chardet (5.2.0) + +### Licenses +License: `LGPL` + + - `LICENSE`: +``` + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. 
These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+``` + +### URLs + - `Documentation`: https://chardet.readthedocs.io/ + - `GitHub Project`: https://github.com/chardet/chardet + - `Homepage`: https://github.com/chardet/chardet + - `Issue Tracker`: https://github.com/chardet/chardet/issues + + ## charset-normalizer (3.4.4) ### Licenses @@ -1537,42 +2532,79 @@ SOFTWARE. - `Issue tracker`: https://github.com/jawah/charset_normalizer/issues -## click (8.3.0) +## choreographer (1.2.1) + +### Licenses +License: `# MIT License` + + - `licenses/LICENSE.md`: +``` +# MIT License + +Copyright (c) Plotly, Inc. + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/plotly/choreographer + - `Repository`: https://github.com/plotly/choreographer + + +## click (8.3.1) ### Licenses License: `BSD-3-Clause` - - `licenses/LICENSE`: + - `licenses/LICENSE.txt`: ``` -BSD 3-Clause License - -Copyright (c) 2019, Eugene Prilepin -All rights reserved. 
+Copyright 2014 Pallets Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +modification, are permitted provided that the following conditions are +met: -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` ### URLs @@ -1628,6 +2660,91 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Issues`: https://github.com/click-contrib/click-option-group/issues +## cloudpickle (3.1.2) + +### Licenses +License: `BSD-3-Clause` + + - `licenses/LICENSE`: +``` +This module was extracted from the `cloud` package, developed by +PiCloud, Inc. + +Copyright (c) 2015, Cloudpickle contributors. +Copyright (c) 2012, Regents of the University of California. +Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the University of California, Berkeley nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + +### URLs + - `Homepage`: https://github.com/cloudpipe/cloudpickle + + +## colorama (0.4.6) + +### Licenses +License: `BSD License` + + - `licenses/LICENSE.txt`: +``` +Copyright (c) 2010 Jonathan Hartley +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holders, nor those of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + +### URLs + - `Homepage`: https://github.com/tartley/colorama + + ## colored (2.3.1) ### Licenses @@ -1639,6 +2756,39 @@ MIT License Copyright 2014-2025 Dimitris Zlatanidis +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `homepage`: https://dslackw.gitlab.io/colored/ + + +## colorlog (6.10.1) + +### Licenses +License: `MIT License` + + - `licenses/LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2012-2021 Sam Clements + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to @@ -1658,7 +2808,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` ### URLs - - `homepage`: https://dslackw.gitlab.io/colored/ + - `Homepage`: https://github.com/borntyping/python-colorlog ## contourpy (1.3.3) @@ -1706,7 +2856,238 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Repository`: https://github.com/contourpy/contourpy -## cuda-bindings (13.0.2) +## coverage (7.12.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE.txt`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS +``` + +### URLs + - `Documentation`: https://coverage.readthedocs.io/en/7.12.0 + - `Funding`: https://tidelift.com/subscription/pkg/pypi-coverage?utm_source=pypi-coverage&utm_medium=referral&utm_campaign=pypi + - `Homepage`: https://github.com/coveragepy/coveragepy + - `Issues`: https://github.com/coveragepy/coveragepy/issues + - `Mastodon`: https://hachyderm.io/@coveragepy + - `Mastodon (nedbat)`: https://hachyderm.io/@nedbat + + +## cramjam (2.11.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2020 Miles Granger + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `documentation`: https://docs.rs/cramjam/latest/cramjam + - `homepage`: https://github.com/milesgranger/pyrus-cramjam + - `repository`: https://github.com/milesgranger/pyrus-cramjam + + +## cuda-bindings (13.0.3) ### Licenses License: `LicenseRef-NVIDIA-SOFTWARE-LICENSE` @@ -1768,7 +3149,7 @@ g. 
You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, - `Repository`: https://github.com/NVIDIA/cuda-python -## cuda-pathfinder (1.3.1) +## cuda-pathfinder (1.3.2) ### Licenses License: `Apache-2.0` @@ -1959,7 +3340,7 @@ License: `Apache-2.0` - `Repository`: https://github.com/NVIDIA/cuda-python -## cuda-python (13.0.2) +## cuda-python (13.0.3) ### Licenses License: `LicenseRef-NVIDIA-SOFTWARE-LICENSE` @@ -2023,6 +3404,16 @@ g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, - `repository`: https://github.com/NVIDIA/cuda-python/ +## cuda-toolkit (13.0.0) + +### Licenses +License: `None` + +### URLs + - `documentation`: https://docs.nvidia.com/cuda/ + - `homepage`: https://developer.nvidia.com/cuda-toolkit + + ## cycler (0.12.1) ### Licenses @@ -2063,6 +3454,43 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.``` - `repository`: https://github.com/matplotlib/cycler +## DataProperty (1.1.0) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2016-2024 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://github.com/thombashi/DataProperty/releases + - `Homepage`: https://github.com/thombashi/DataProperty + - `Source`: https://github.com/thombashi/DataProperty + - `Tracker`: https://github.com/thombashi/DataProperty/issues + + ## datasets (3.1.0) ### Licenses @@ -2545,6 +3973,306 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Source Code`: https://github.com/uqfoundation/dill +## distlib (0.4.0) + +### Licenses +License: `PSF-2.0` + + - `LICENSE.txt`: +``` +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations (now Zope +Corporation, see http://www.zope.com). In 2001, the Python Software +Foundation (PSF, see http://www.python.org/psf/) was formed, a +non-profit organization created specifically to own Python-related +Intellectual Property. Zope Corporation is a sponsoring member of +the PSF. + +All Python releases are Open Source (see http://www.opensource.org for +the Open Source Definition). 
Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.2 2.1.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2.1 2.2 2002 PSF yes + 2.2.2 2.2.1 2002 PSF yes + 2.2.3 2.2.2 2003 PSF yes + 2.3 2.2.2 2002-2003 PSF yes + 2.3.1 2.3 2002-2003 PSF yes + 2.3.2 2.3.1 2002-2003 PSF yes + 2.3.3 2.3.2 2002-2003 PSF yes + 2.3.4 2.3.3 2004 PSF yes + 2.3.5 2.3.4 2005 PSF yes + 2.4 2.3 2004 PSF yes + 2.4.1 2.4 2005 PSF yes + 2.4.2 2.4.1 2005 PSF yes + 2.4.3 2.4.2 2006 PSF yes + 2.4.4 2.4.3 2006 PSF yes + 2.5 2.4 2006 PSF yes + 2.5.1 2.5 2007 PSF yes + 2.5.2 2.5.1 2008 PSF yes + 2.5.3 2.5.2 2008 PSF yes + 2.6 2.5 2008 PSF yes + 2.6.1 2.6 2008 PSF yes + 2.6.2 2.6.1 2009 PSF yes + 2.6.3 2.6.2 2009 PSF yes + 2.6.4 2.6.3 2009 PSF yes + 2.6.5 2.6.4 2010 PSF yes + 3.0 2.6 2008 PSF yes + 3.0.1 3.0 2009 PSF yes + 3.1 3.0.1 2009 PSF yes + 3.1.1 3.1 2009 PSF yes + 3.1.2 3.1 2010 PSF yes + 3.2 3.1 2010 PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. 
+ +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 +Python Software Foundation; All Rights Reserved" are retained in Python alone or +in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. 
PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. 
Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the Internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the Internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. 
CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. +Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. 
+ +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +``` + +### URLs + - `Documentation`: https://distlib.readthedocs.io/ + - `Homepage`: https://github.com/pypa/distlib + - `Source`: https://github.com/pypa/distlib + - `Tracker`: https://github.com/pypa/distlib/issues + + ## distro (1.9.0) ### Licenses @@ -2760,6 +4488,42 @@ Apache License - `Homepage`: https://github.com/python-distro/distro +## docstring_parser (0.17.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE.md`: +``` +The MIT License (MIT) + +Copyright (c) 2018 Marcin Kurczewski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be 
included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `changelog`: https://github.com/rr-/docstring_parser/blob/master/CHANGELOG.md + - `homepage`: https://github.com/rr-/docstring_parser + - `repository`: https://github.com/rr-/docstring_parser + + ## einops (0.8.1) ### Licenses @@ -3199,6 +4963,37 @@ License: `Apache 2.0` - `Homepage`: https://github.com/huggingface/evaluate +## execnet (2.1.2) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +``` + +### URLs + - `Homepage`: https://execnet.readthedocs.io/en/latest/ + + ## fastapi (0.117.1) ### Licenses @@ -3237,6 +5032,195 @@ THE SOFTWARE. - `Repository`: https://github.com/fastapi/fastapi +## fastparquet (2024.11.0) + +### Licenses +License: `Apache License 2.0` + + - `LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS``` + +### URLs + - `Homepage`: https://github.com/dask/fastparquet/ + + ## filelock (3.20.0) ### Licenses @@ -3513,35 +5497,6 @@ MIT License 3rdparty/spdlog 3rdparty/spdlog/include/spdlog/fmt/bundled (fmt library) -``` - - - `licenses/licenses/LICENSE.spdlog.txt`: -``` -The MIT License (MIT) - -Copyright (c) 2016 Gabi Melman. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - --- NOTE: Third party dependency used by this software -- -This software depends on the fmt lib (MIT License), -and users must comply to its license: https://raw.githubusercontent.com/fmtlib/fmt/master/LICENSE ``` - `licenses/licenses/LICENSE.fmt.txt`: @@ -3573,6 +5528,66 @@ As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into a machine-executable object form of such source code, you may redistribute such embedded portions in such object form without including the above copyright and permission notices. +``` + + - `licenses/licenses/LICENSE.cutlass.txt`: +``` +Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `licenses/licenses/LICENSE.spdlog.txt`: +``` +The MIT License (MIT) + +Copyright (c) 2016 Gabi Melman. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +-- NOTE: Third party dependency used by this software -- +This software depends on the fmt lib (MIT License), +and users must comply to its license: https://raw.githubusercontent.com/fmtlib/fmt/master/LICENSE ``` - `licenses/licenses/LICENSE.flashattention3.txt`: @@ -3596,37 +5611,6 @@ modification, are permitted provided that the following conditions are met: contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -``` - - - `licenses/licenses/LICENSE.cutlass.txt`: -``` -Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-SPDX-License-Identifier: BSD-3-Clause - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -3648,6 +5632,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ### Licenses License: `MIT` + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2017 Just van Rossum + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + - `licenses/LICENSE.external`: ``` FontTools includes the following font projects for testing purposes, which are @@ -4031,31 +6040,6 @@ furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -``` - - - `licenses/LICENSE`: -``` -MIT License - -Copyright (c) 2017 Just van Rossum - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -4336,7 +6320,401 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Homepage`: https://github.com/fsspec/filesystem_spec -## grpcio (1.75.1) +## genai-perf (0.0.13) + +### Licenses +License: `BSD` + + - `licenses/LICENSE`: +``` +BSD 3-Clause License + +Copyright (c) 2024, Triton Inference Server + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+``` + +### URLs + - `Bug Tracker`: https://github.com/triton-inference-server/perf_analyzer/issues + - `Homepage`: https://github.com/triton-inference-server/perf_analyzer + + +## googleapis-common-protos (1.72.0) + +### Licenses +License: `Apache 2.0` + + - `licenses/LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+``` + +### URLs + - `Repository`: https://github.com/googleapis/google-cloud-python/tree/main/packages/googleapis-common-protos + + +## graphviz (0.21) + +### Licenses +License: `MIT` + + - `licenses/LICENSE.txt`: +``` +The MIT License (MIT) + +Copyright (c) 2013-2025 Sebastian Bank + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` + +### URLs + - `CI`: https://github.com/xflr6/graphviz/actions + - `Changelog`: https://graphviz.readthedocs.io/en/latest/changelog.html + - `Coverage`: https://codecov.io/gh/xflr6/graphviz + - `Documentation`: https://graphviz.readthedocs.io + - `Homepage`: https://github.com/xflr6/graphviz + - `Issue Tracker`: https://github.com/xflr6/graphviz/issues + + +## greenlet (3.2.4) + +### Licenses +License: `MIT AND Python-2.0` + + - `licenses/LICENSE.PSF`: +``` +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. 
This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011 Python Software Foundation; All Rights Reserved" are retained in Python +alone or in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. 
Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. +``` + + - `licenses/LICENSE`: +``` +The following files are derived from Stackless Python and are subject to the +same license as Stackless Python: + + src/greenlet/slp_platformselect.h + files in src/greenlet/platform/ directory + +See LICENSE.PSF and http://www.stackless.com/ for details. + +Unless otherwise noted, the files in greenlet have been released under the +following MIT license: + +Copyright (c) Armin Rigo, Christian Tismer and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+``` + +### URLs + - `Bug Tracker`: https://github.com/python-greenlet/greenlet/issues + - `Changes`: https://greenlet.readthedocs.io/en/latest/changes.html + - `Documentation`: https://greenlet.readthedocs.io/ + - `Homepage`: https://greenlet.readthedocs.io/ + - `Source Code`: https://github.com/python-greenlet/greenlet/ + + +## grpcio (1.76.0) ### Licenses License: `Apache License 2.0` @@ -4617,7 +6995,7 @@ Mozilla Public License Version 2.0 means any form of the work other than Source Code Form. 1.7. "Larger Work" - means a work that combines Covered Software with other material, in + means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. "License" @@ -5044,7 +7422,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Source`: https://github.com/h5py/h5py -## hf-xet (1.1.10) +## hf-xet (1.2.0) ### Licenses License: `Apache Software License` @@ -5331,7 +7709,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - `Source`: https://github.com/encode/httpx -## huggingface-hub (0.35.3) +## huggingface-hub (0.36.0) ### Licenses License: `Apache` @@ -5545,6 +7923,6766 @@ License: `Apache` - `Homepage`: https://github.com/huggingface/huggingface_hub +## identify (2.6.15) + +### Licenses +License: `MIT` + + - `vendor/licenses.py`: +``` +from __future__ import annotations +LICENSES = ( + ( + '0BSD', + '''\ +Copyright (c) [year] [fullname] + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. +''', + ), + ( + 'AFL-3.0', + '''\ +Academic Free License (“AFL”) v. 3.0 + +This Academic Free License (the "License") applies to any original work of +authorship (the "Original Work") whose owner (the "Licensor") has placed the +following licensing notice adjacent to the copyright notice for the Original +Work: + + Licensed under the Academic Free License version 3.0 + +1) Grant of Copyright License. Licensor grants You a worldwide, royalty-free, +non-exclusive, sublicensable license, for the duration of the copyright, to do +the following: + + a) to reproduce the Original Work in copies, either alone or as part of a + collective work; + + b) to translate, adapt, alter, transform, modify, or arrange the Original + Work, thereby creating derivative works ("Derivative Works") based upon + the Original Work; + + c) to distribute or communicate copies of the Original Work and + Derivative Works to the public, under any license of your choice that + does not contradict the terms and conditions, including Licensor’s + reserved rights and remedies, in this Academic Free License; + d) to perform the Original Work publicly; and + e) to display the Original Work publicly. + +2) Grant of Patent License. Licensor grants You a worldwide, royalty-free, +non-exclusive, sublicensable license, under patent claims owned or controlled +by the Licensor that are embodied in the Original Work as furnished by the +Licensor, for the duration of the patents, to make, use, sell, offer for sale, +have made, and import the Original Work and Derivative Works. + +3) Grant of Source Code License. 
The term "Source Code" means the preferred +form of the Original Work for making modifications to it and all available +documentation describing how to modify the Original Work. Licensor agrees to +provide a machine-readable copy of the Source Code of the Original Work along +with each copy of the Original Work that Licensor distributes. Licensor +reserves the right to satisfy this obligation by placing a machine-readable +copy of the Source Code in an information repository reasonably calculated to +permit inexpensive and convenient access by You for as long as Licensor +continues to distribute the Original Work. + +4) Exclusions From License Grant. Neither the names of Licensor, nor the names +of any contributors to the Original Work, nor any of their trademarks or +service marks, may be used to endorse or promote products derived from this +Original Work without express prior permission of the Licensor. Except as +expressly stated herein, nothing in this License grants any license to +Licensor’s trademarks, copyrights, patents, trade secrets or any other +intellectual property. No patent license is granted to make, use, sell, offer +for sale, have made, or import embodiments of any patent claims other than the +licensed claims defined in Section 2. No license is granted to the trademarks +of Licensor even if such marks are included in the Original Work. Nothing in +this License shall be interpreted to prohibit Licensor from licensing under +terms different from this License any Original Work that Licensor otherwise +would have a right to license. + +5) External Deployment. The term "External Deployment" means the use, +distribution, or communication of the Original Work or Derivative Works in any +way such that the Original Work or Derivative Works may be used by anyone +other than You, whether those works are distributed or communicated to those +persons or made available as an application intended for use over a network. 
+As an express condition for the grants of license hereunder, You must treat +any External Deployment by You of the Original Work or a Derivative Work as a +distribution under section 1(c). + +6) Attribution Rights. You must retain, in the Source Code of any Derivative +Works that You create, all copyright, patent, or trademark notices from the +Source Code of the Original Work, as well as any notices of licensing and any +descriptive text identified therein as an "Attribution Notice." You must cause +the Source Code for any Derivative Works that You create to carry a prominent +Attribution Notice reasonably calculated to inform recipients that You have +modified the Original Work. + +7) Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that +the copyright in and to the Original Work and the patent rights granted herein +by Licensor are owned by the Licensor or are sublicensed to You under the +terms of this License with the permission of the contributor(s) of those +copyrights and patent rights. Except as expressly stated in the immediately +preceding sentence, the Original Work is provided under this License on an "AS +IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without +limitation, the warranties of non-infringement, merchantability or fitness for +a particular purpose. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK +IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this +License. No license to the Original Work is granted by this License except +under this disclaimer. + +8) Limitation of Liability. 
Under no circumstances and under no legal theory, +whether in tort (including negligence), contract, or otherwise, shall the +Licensor be liable to anyone for any indirect, special, incidental, or +consequential damages of any character arising as a result of this License or +the use of the Original Work including, without limitation, damages for loss +of goodwill, work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses. This limitation of liability shall not +apply to the extent applicable law prohibits such limitation. + +9) Acceptance and Termination. If, at any time, You expressly assented to this +License, that assent indicates your clear and irrevocable acceptance of this +License and all of its terms and conditions. If You distribute or communicate +copies of the Original Work or a Derivative Work, You must make a reasonable +effort under the circumstances to obtain the express assent of recipients to +the terms of this License. This License conditions your rights to undertake +the activities listed in Section 1, including your right to create Derivative +Works based upon the Original Work, and doing so without honoring these terms +and conditions is prohibited by copyright law and international treaty. +Nothing in this License is intended to affect copyright exceptions and +limitations (including “fair use” or “fair dealing”). This License shall +terminate immediately and You may no longer exercise any of the rights granted +to You by this License upon your failure to honor the conditions in Section +1(c). + +10) Termination for Patent Action. This License shall terminate automatically +and You may no longer exercise any of the rights granted to You by this +License as of the date You commence an action, including a cross-claim or +counterclaim, against Licensor or any licensee alleging that the Original Work +infringes a patent. 
This termination provision shall not apply for an action +alleging patent infringement by combinations of the Original Work with other +software or hardware. + +11) Jurisdiction, Venue and Governing Law. Any action or suit relating to this +License may be brought only in the courts of a jurisdiction wherein the +Licensor resides or in which Licensor conducts its primary business, and under +the laws of that jurisdiction excluding its conflict-of-law provisions. The +application of the United Nations Convention on Contracts for the +International Sale of Goods is expressly excluded. Any use of the Original +Work outside the scope of this License or after its termination shall be +subject to the requirements and penalties of copyright or patent law in the +appropriate jurisdiction. This section shall survive the termination of this +License. + +12) Attorneys’ Fees. In any action to enforce the terms of this License or +seeking damages relating thereto, the prevailing party shall be entitled to +recover its costs and expenses, including, without limitation, reasonable +attorneys' fees and costs incurred in connection with such action, including +any appeal of such action. This section shall survive the termination of this +License. + +13) Miscellaneous. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent necessary +to make it enforceable. + +14) Definition of "You" in This License. "You" throughout this License, +whether in upper or lower case, means an individual or a legal entity +exercising rights under, and complying with all of the terms of, this License. +For legal entities, "You" includes any entity that controls, is controlled by, +or is under common control with you. 
For purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the direction or +management of such entity, whether by contract or otherwise, or (ii) ownership +of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial +ownership of such entity. + +15) Right to Use. You may use the Original Work in all ways not otherwise +restricted or conditioned by this License or by law, and Licensor promises not +to interfere with or be responsible for such uses by You. + +16) Modification of This License. This License is Copyright © 2005 Lawrence +Rosen. Permission is granted to copy, distribute, or communicate this License +without modification. Nothing in this License permits You to modify this +License as applied to the Original Work or to Derivative Works. However, You +may modify the text of this License and copy, distribute or communicate your +modified version (the "Modified License") and apply it to other original works +of authorship subject to the following conditions: (i) You may not indicate in +any way that your Modified License is the "Academic Free License" or "AFL" and +you may not use those names in the name of your Modified License; (ii) You +must replace the notice specified in the first paragraph above with the notice +"Licensed under <insert your license name here>" or with a notice of your own +that is not confusingly similar to the notice in this License; and (iii) You +may not claim that your original works are open source software unless your +Modified License has been approved by Open Source Initiative (OSI) and You +comply with its license review and certification process. +''', + ), + ( + 'AGPL-3.0', + '''\ +GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. +''', + ), + ( + 'Apache-2.0', + '''\ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +''', + ), + ( + 'Artistic-2.0', + '''\ +The Artistic License 2.0 + + Copyright (c) 2000-2006, The Perl Foundation. + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +Preamble + +This license establishes the terms under which a given free software +Package may be copied, modified, distributed, and/or redistributed. +The intent is that the Copyright Holder maintains some artistic +control over the development of that Package while still keeping the +Package available as open source and free software. + +You are always permitted to make arrangements wholly outside of this +license directly with the Copyright Holder of a given Package. If the +terms of this license do not permit the full use that you propose to +make of the Package, you should contact the Copyright Holder and seek +a different licensing arrangement. + +Definitions + + "Copyright Holder" means the individual(s) or organization(s) + named in the copyright notice for the entire Package. + + "Contributor" means any party that has contributed code or other + material to the Package, in accordance with the Copyright Holder's + procedures. + + "You" and "your" means any person who would like to copy, + distribute, or modify the Package. 
+ + "Package" means the collection of files distributed by the + Copyright Holder, and derivatives of that collection and/or of + those files. A given Package may consist of either the Standard + Version, or a Modified Version. + + "Distribute" means providing a copy of the Package or making it + accessible to anyone else, or in the case of a company or + organization, to others outside of your company or organization. + + "Distributor Fee" means any fee that you charge for Distributing + this Package or providing support for this Package to another + party. It does not mean licensing fees. + + "Standard Version" refers to the Package if it has not been + modified, or has been modified only in ways explicitly requested + by the Copyright Holder. + + "Modified Version" means the Package, if it has been changed, and + such changes were not explicitly requested by the Copyright + Holder. + + "Original License" means this Artistic License as Distributed with + the Standard Version of the Package, in its current version or as + it may be modified by The Perl Foundation in the future. + + "Source" form means the source code, documentation source, and + configuration files for the Package. + + "Compiled" form means the compiled bytecode, object code, binary, + or any other form resulting from mechanical transformation or + translation of the Source form. + + +Permission for Use and Modification Without Distribution + +(1) You are permitted to use the Standard Version and create and use +Modified Versions for any purpose without restriction, provided that +you do not Distribute the Modified Version. + + +Permissions for Redistribution of the Standard Version + +(2) You may Distribute verbatim copies of the Source form of the +Standard Version of this Package in any medium without restriction, +either gratis or for a Distributor Fee, provided that you duplicate +all of the original copyright notices and associated disclaimers. 
At +your discretion, such verbatim copies may or may not include a +Compiled form of the Package. + +(3) You may apply any bug fixes, portability changes, and other +modifications made available from the Copyright Holder. The resulting +Package will still be considered the Standard Version, and as such +will be subject to the Original License. + + +Distribution of Modified Versions of the Package as Source + +(4) You may Distribute your Modified Version as Source (either gratis +or for a Distributor Fee, and with or without a Compiled form of the +Modified Version) provided that you clearly document how it differs +from the Standard Version, including, but not limited to, documenting +any non-standard features, executables, or modules, and provided that +you do at least ONE of the following: + + (a) make the Modified Version available to the Copyright Holder + of the Standard Version, under the Original License, so that the + Copyright Holder may include your modifications in the Standard + Version. + + (b) ensure that installation of your Modified Version does not + prevent the user installing or running the Standard Version. In + addition, the Modified Version must bear a name that is different + from the name of the Standard Version. + + (c) allow anyone who receives a copy of the Modified Version to + make the Source form of the Modified Version available to others + under + + (i) the Original License or + + (ii) a license that permits the licensee to freely copy, + modify and redistribute the Modified Version using the same + licensing terms that apply to the copy that the licensee + received, and requires that the Source form of the Modified + Version, and of any works derived from it, be made freely + available in that license fees are prohibited but Distributor + Fees are allowed. 
+ + +Distribution of Compiled Forms of the Standard Version +or Modified Versions without the Source + +(5) You may Distribute Compiled forms of the Standard Version without +the Source, provided that you include complete instructions on how to +get the Source of the Standard Version. Such instructions must be +valid at the time of your distribution. If these instructions, at any +time while you are carrying out such distribution, become invalid, you +must provide new instructions on demand or cease further distribution. +If you provide valid instructions or cease distribution within thirty +days after you become aware that the instructions are invalid, then +you do not forfeit any of your rights under this license. + +(6) You may Distribute a Modified Version in Compiled form without +the Source, provided that you comply with Section 4 with respect to +the Source of the Modified Version. + + +Aggregating or Linking the Package + +(7) You may aggregate the Package (either the Standard Version or +Modified Version) with other packages and Distribute the resulting +aggregation provided that you do not charge a licensing fee for the +Package. Distributor Fees are permitted, and licensing fees for other +components in the aggregation are permitted. The terms of this license +apply to the use and Distribution of the Standard or Modified Versions +as included in the aggregation. + +(8) You are permitted to link Modified and Standard Versions with +other works, to embed the Package in a larger work of your own, or to +build stand-alone binary or bytecode versions of applications that +include the Package, and Distribute the result without restriction, +provided the result does not expose a direct interface to the Package. + + +Items That are Not Considered Part of a Modified Version + +(9) Works (including, but not limited to, modules and scripts) that +merely extend or make use of the Package, do not, by themselves, cause +the Package to be a Modified Version. 
In addition, such works are not +considered parts of the Package itself, and are not subject to the +terms of this license. + + +General Provisions + +(10) Any use, modification, and distribution of the Standard or +Modified Versions is governed by this Artistic License. By using, +modifying or distributing the Package, you accept this license. Do not +use, modify, or distribute the Package, if you do not accept this +license. + +(11) If your Modified Version has been derived from a Modified +Version made by someone other than you, you are nevertheless required +to ensure that your Modified Version complies with the requirements of +this license. + +(12) This license does not grant you the right to use any trademark, +service mark, tradename, or logo of the Copyright Holder. + +(13) This license includes the non-exclusive, worldwide, +free-of-charge patent license to make, have made, use, offer to sell, +sell, import and otherwise transfer the Package with respect to any +patent claims licensable by the Copyright Holder that are necessarily +infringed by the Package. If you institute patent litigation +(including a cross-claim or counterclaim) against any party alleging +that the Package constitutes direct or contributory patent +infringement, then this Artistic License to you shall terminate on the +date that such litigation is filed. + +(14) Disclaimer of Warranty: +THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS +IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR +NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL +LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+''', + ), + ( + 'BSD-2-Clause', + '''\ +BSD 2-Clause License + +Copyright (c) [year], [fullname] +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +''', + ), + ( + 'BSD-3-Clause', + '''\ +BSD 3-Clause License + +Copyright (c) [year], [fullname] +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +''', + ), + ( + 'BSD-3-Clause-Clear', + '''\ +The Clear BSD License + +Copyright (c) [year] [fullname] +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +''', + ), + ( + 'BSL-1.0', + '''\ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +''', + ), + ( + 'CC-BY-4.0', + '''\ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. 
Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + + +Section 1 -- Definitions. + + a. 
Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. 
Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. 
Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. 
Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. 
If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. 
The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. +''', + ), + ( + 'CC-BY-SA-4.0', + '''\ +Attribution-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. 
+ + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. 
More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-ShareAlike 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-ShareAlike 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + l. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + m. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + + including for purposes of Section 3(b); and + c. 
You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. 
However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. 
No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. +''', + ), + ( + 'CC0-1.0', + '''\ +CC0 1.0 Universal + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator and +subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for the +purpose of contributing to a commons of creative, cultural and scientific +works ("Commons") that the public can reliably and without fear of later +claims of infringement build upon, modify, incorporate in other works, reuse +and redistribute as freely as possible in any form whatsoever and for any +purposes, including without limitation commercial purposes. These owners may +contribute to the Commons to promote the ideal of a free culture and the +further production of creative, cultural and scientific works, or to gain +reputation or greater distribution for their Work in part through the use and +efforts of others. + +For these and/or other purposes and motivations, and without any expectation +of additional consideration or compensation, the person associating CC0 with a +Work (the "Affirmer"), to the extent that he or she is an owner of Copyright +and Related Rights in the Work, voluntarily elects to apply CC0 to the Work +and publicly distribute the Work under its terms, with knowledge of his or her +Copyright and Related Rights in the Work and the meaning and intended legal +effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not limited +to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, communicate, + and translate a Work; + + ii. moral rights retained by the original author(s) and/or performer(s); + + iii. publicity and privacy rights pertaining to a person's image or likeness + depicted in a Work; + + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + + v. rights protecting the extraction, dissemination, use and reuse of data in + a Work; + + vi. 
database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation thereof, + including any amended or successor version of such directive); and + + vii. other similar, equivalent or corresponding rights throughout the world + based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, +applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and +unconditionally waives, abandons, and surrenders all of Affirmer's Copyright +and Related Rights and associated claims and causes of action, whether now +known or unknown (including existing as well as future claims and causes of +action), in the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number of +copies, and (iv) for any purpose whatsoever, including without limitation +commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes +the Waiver for the benefit of each member of the public at large and to the +detriment of Affirmer's heirs and successors, fully intending that such Waiver +shall not be subject to revocation, rescission, cancellation, termination, or +any other legal or equitable action to disrupt the quiet enjoyment of the Work +by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be +judged legally invalid or ineffective under applicable law, then the Waiver +shall be preserved to the maximum extent permitted taking into account +Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver +is so judged Affirmer hereby grants to each affected person a royalty-free, +non transferable, non sublicensable, non exclusive, irrevocable and +unconditional license to exercise Affirmer's Copyright and Related Rights in +the Work (i) in all territories worldwide, (ii) for the maximum duration +provided by applicable law or treaty (including future time extensions), (iii) +in any current or future medium and for any number of copies, and (iv) for any +purpose whatsoever, including without limitation commercial, advertising or +promotional purposes (the "License"). The License shall be deemed effective as +of the date CC0 was applied by Affirmer to the Work. Should any part of the +License for any reason be judged legally invalid or ineffective under +applicable law, such partial invalidity or ineffectiveness shall not +invalidate the remainder of the License, and in such case Affirmer hereby +affirms that he or she will not (i) exercise any of his or her remaining +Copyright and Related Rights in the Work or (ii) assert any associated claims +and causes of action with respect to the Work, in either case contrary to +Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + + b. Affirmer offers the Work as-is and makes no representations or warranties + of any kind concerning the Work, express, implied, statutory or otherwise, + including without limitation warranties of title, merchantability, fitness + for a particular purpose, non infringement, or the absence of latent or + other defects, accuracy, or the present or absence of errors, whether or not + discoverable, all to the greatest extent permissible under applicable law. + + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without limitation + any person's Copyright and Related Rights in the Work. Further, Affirmer + disclaims responsibility for obtaining any necessary consents, permissions + or other rights required for any use of the Work. + + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to this + CC0 or use of the Work. + +For more information, please see + +''', + ), + ( + 'ECL-2.0', + '''\ +Educational Community License + +Version 2.0, April 2007 + +http://opensource.org/licenses/ECL-2.0 + +The Educational Community License version 2.0 ("ECL") consists of the Apache +2.0 license, modified to change the scope of the patent grant in section 3 to +be specific to the needs of the education communities using this license. The +original Apache 2.0 license can be found at: +http://www.apache.org/licenses/LICENSE-2.0 + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the +copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other +entities that control, are controlled by, or are under common control with +that entity. For the purposes of this definition, "control" means (i) the +power, direct or indirect, to cause the direction or management of such +entity, whether by contract or otherwise, or (ii) ownership of fifty percent +(50%) or more of the outstanding shares, or (iii) beneficial ownership of such +entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation source, and +configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object +code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, +made available under the License, as indicated by a copyright notice that is +included in or attached to the work (an example is provided in the Appendix +below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative +Works shall not include works that remain separable from, or merely link (or +bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original +version of the Work and any modifications or additions to that Work or +Derivative Works thereof, that is intentionally submitted to Licensor for +inclusion in the Work by the copyright owner or by an individual or Legal +Entity authorized to submit on behalf of the copyright owner. 
For the purposes +of this definition, "submitted" means any form of electronic, verbal, or +written communication sent to the Licensor or its representatives, including +but not limited to communication on electronic mailing lists, source code +control systems, and issue tracking systems that are managed by, or on behalf +of, the Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise designated +in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and +such Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such +Contributor that are necessarily infringed by their Contribution(s) alone or +by combination of their Contribution(s) with the Work to which such +Contribution(s) was submitted. 
If You institute patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging that +the Work or a Contribution incorporated within the Work constitutes direct or +contributory patent infringement, then any patent licenses granted to You +under this License for that Work shall terminate as of the date such +litigation is filed. Any patent license granted hereby with respect to +contributions by an individual employed by an institution or organization is +limited to patent claims where the individual that is the author of the Work +is also the inventor of the patent claims licensed, and where the organization +or institution has the right to grant such license under applicable grant and +research funding agreements. No other express or implied licenses are granted. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works +thereof in any medium, with or without modifications, and in Source or Object +form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and You must cause any modified files to carry prominent notices +stating that You changed the files; and You must retain, in the Source form of +any Derivative Works that You distribute, all copyright, patent, trademark, +and attribution notices from the Source form of the Work, excluding those +notices that do not pertain to any part of the Derivative Works; and If the +Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of +the following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative 
Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. You may add Your own copyright statement to Your +modifications and may provide additional or different license terms and +conditions for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, reproduction, and +distribution of the Work otherwise complies with the conditions stated in this +License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally +submitted for inclusion in the Work by You to the Licensor shall be under the +terms and conditions of this License, without any additional terms or +conditions. Notwithstanding the above, nothing herein shall supersede or +modify the terms of any separate license agreement you may have executed with +Licensor regarding such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides +the Work (and each Contributor provides its Contributions) on an "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You +are solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a result of +this License or out of the use or inability to use the Work (including but not +limited to damages for loss of goodwill, work stoppage, computer failure or +malfunction, or any and all other commercial damages or losses), even if such +Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. +However, in accepting such obligations, You may act only on Your own behalf +and on Your sole responsibility, not on behalf of any other Contributor, and +only if You agree to indemnify, defend, and hold each Contributor harmless for +any liability incurred by, or claims asserted against, such Contributor by +reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Educational Community License to your work + +To apply the Educational Community License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" replaced with +your own identifying information. (Don't include the brackets!) The text +should be enclosed in the appropriate comment syntax for the file format. 
We +also recommend that a file or class name and description of purpose be +included on the same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] Licensed under the Educational +Community License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. You may obtain a copy of the License at + +http://opensource.org/licenses/ECL-2.0 + + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations under +the License. +''', + ), + ( + 'EPL-1.0', + '''\ +Eclipse Public License - v 1.0 + +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC +LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM +CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + a) in the case of the initial Contributor, the initial code and + documentation distributed under this Agreement, and + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + +where such changes and/or additions to the Program originate from and are +distributed by that particular Contributor. A Contribution 'originates' from a +Contributor if it was added to the Program by such Contributor itself or +anyone acting on such Contributor's behalf. Contributions do not include +additions to the Program which: (i) are separate modules of software +distributed in conjunction with the Program under their own license agreement, +and (ii) are not derivative works of the Program. +"Contributor" means any person or entity that distributes the Program. 
+ +"Licensed Patents" mean patent claims licensable by a Contributor which are +necessarily infringed by the use or sale of its Contribution alone or when +combined with the Program. + +"Program" means the Contributions distributed in accordance with this +Agreement. + +"Recipient" means anyone who receives the Program under this Agreement, +including all Contributors. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby grants + Recipient a non-exclusive, worldwide, royalty-free copyright license to + reproduce, prepare derivative works of, publicly display, publicly + perform, distribute and sublicense the Contribution of such Contributor, + if any, and such derivative works, in source code and object code form. + + b) Subject to the terms of this Agreement, each Contributor hereby grants + Recipient a non-exclusive, worldwide, royalty-free patent license under + Licensed Patents to make, use, sell, offer to sell, import and otherwise + transfer the Contribution of such Contributor, if any, in source code and + object code form. This patent license shall apply to the combination of + the Contribution and the Program if, at the time the Contribution is + added by the Contributor, such addition of the Contribution causes such + combination to be covered by the Licensed Patents. The patent license + shall not apply to any other combinations which include the Contribution. + No hardware per se is licensed hereunder. + + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the patent + or other intellectual property rights of any other entity. Each + Contributor disclaims any liability to Recipient for claims brought by + any other entity based on infringement of intellectual property rights or + otherwise. 
As a condition to exercising the rights and licenses granted + hereunder, each Recipient hereby assumes sole responsibility to secure + any other intellectual property rights needed, if any. For example, if a + third party patent license is required to allow Recipient to distribute + the Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + + d) Each Contributor represents that to its knowledge it has sufficient + copyright rights in its Contribution, if any, to grant the copyright + license set forth in this Agreement. + +3. REQUIREMENTS +A Contributor may choose to distribute the Program in object code form under +its own license agreement, provided that: + + a) it complies with the terms and conditions of this Agreement; and + + b) its license agreement: + i) effectively disclaims on behalf of all Contributors all + warranties and conditions, express and implied, including warranties + or conditions of title and non-infringement, and implied warranties + or conditions of merchantability and fitness for a particular + purpose; + ii) effectively excludes on behalf of all Contributors all liability + for damages, including direct, indirect, special, incidental and + consequential damages, such as lost profits; + iii) states that any provisions which differ from this Agreement are + offered by that Contributor alone and not by any other party; and + iv) states that source code for the Program is available from such + Contributor, and informs licensees how to obtain it in a reasonable + manner on or through a medium customarily used for software + exchange. + +When the Program is made available in source code form: + + a) it must be made available under this Agreement; and + + b) a copy of this Agreement must be included with each copy of the + Program. +Contributors may not remove or alter any copyright notices contained within +the Program. 
+ +Each Contributor must identify itself as the originator of its Contribution, +if any, in a manner that reasonably allows subsequent Recipients to identify +the originator of the Contribution. + +4. COMMERCIAL DISTRIBUTION +Commercial distributors of software may accept certain responsibilities with +respect to end users, business partners and the like. While this license is +intended to facilitate the commercial use of the Program, the Contributor who +includes the Program in a commercial product offering should do so in a manner +which does not create potential liability for other Contributors. Therefore, +if a Contributor includes the Program in a commercial product offering, such +Contributor ("Commercial Contributor") hereby agrees to defend and indemnify +every other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits and +other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such Commercial +Contributor in connection with its distribution of the Program in a commercial +product offering. The obligations in this section do not apply to any claims +or Losses relating to any actual or alleged intellectual property +infringement. In order to qualify, an Indemnified Contributor must: a) +promptly notify the Commercial Contributor in writing of such claim, and b) +allow the Commercial Contributor to control, and cooperate with the Commercial +Contributor in, the defense and any related settlement negotiations. The +Indemnified Contributor may participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial product +offering, Product X. That Contributor is then a Commercial Contributor. 
If +that Commercial Contributor then makes performance claims, or offers +warranties related to Product X, those performance claims and warranties are +such Commercial Contributor's responsibility alone. Under this section, the +Commercial Contributor would have to defend claims against the other +Contributors related to those performance claims and warranties, and if a +court requires any other Contributor to pay any damages as a result, the +Commercial Contributor must pay those damages. + +5. NO WARRANTY +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, +NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each +Recipient is solely responsible for determining the appropriateness of using +and distributing the Program and assumes all risks associated with its +exercise of rights under this Agreement , including but not limited to the +risks and costs of program errors, compliance with applicable laws, damage to +or loss of data, programs or equipment, and unavailability or interruption of +operations. + +6. DISCLAIMER OF LIABILITY +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY +CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION +LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGES. + +7. 
GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of the +remainder of the terms of this Agreement, and without further action by the +parties hereto, such provision shall be reformed to the minimum extent +necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Program itself +(excluding combinations of the Program with other software or hardware) +infringes such Recipient's patent(s), then such Recipient's rights granted +under Section 2(b) shall terminate as of the date such litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it fails to +comply with any of the material terms or conditions of this Agreement and does +not cure such failure in a reasonable period of time after becoming aware of +such noncompliance. If all Recipient's rights under this Agreement terminate, +Recipient agrees to cease use and distribution of the Program as soon as +reasonably practicable. However, Recipient's obligations under this Agreement +and any licenses granted by Recipient relating to the Program shall continue +and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, but in +order to avoid inconsistency the Agreement is copyrighted and may only be +modified in the following manner. The Agreement Steward reserves the right to +publish new versions (including revisions) of this Agreement from time to +time. No one other than the Agreement Steward has the right to modify this +Agreement. The Eclipse Foundation is the initial Agreement Steward. The +Eclipse Foundation may assign the responsibility to serve as the Agreement +Steward to a suitable separate entity. Each new version of the Agreement will +be given a distinguishing version number. 
The Program (including +Contributions) may always be distributed subject to the version of the +Agreement under which it was received. In addition, after a new version of the +Agreement is published, Contributor may elect to distribute the Program +(including its Contributions) under the new version. Except as expressly +stated in Sections 2(a) and 2(b) above, Recipient receives no rights or +licenses to the intellectual property of any Contributor under this Agreement, +whether expressly, by implication, estoppel or otherwise. All rights in the +Program not expressly granted under this Agreement are reserved. + +This Agreement is governed by the laws of the State of New York and the +intellectual property laws of the United States of America. No party to this +Agreement will bring a legal action under this Agreement more than one year +after the cause of action arose. Each party waives its rights to a jury trial +in any resulting litigation. +''', + ), + ( + 'EPL-2.0', + '''\ +Eclipse Public License - v 2.0 + + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE + PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION + OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial content + Distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + where such changes and/or additions to the Program originate from + and are Distributed by that particular Contributor. A Contribution + "originates" from a Contributor if it was added to the Program by + such Contributor itself or anyone acting on such Contributor's behalf. + Contributions do not include changes or additions to the Program that + are not Modified Works. + +"Contributor" means any person or entity that Distributes the Program. 
+ +"Licensed Patents" mean patent claims licensable by a Contributor which +are necessarily infringed by the use or sale of its Contribution alone +or when combined with the Program. + +"Program" means the Contributions Distributed in accordance with this +Agreement. + +"Recipient" means anyone who receives the Program under this Agreement +or any Secondary License (as applicable), including Contributors. + +"Derivative Works" shall mean any work, whether in Source Code or other +form, that is based on (or derived from) the Program and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. + +"Modified Works" shall mean any work in Source Code or other form that +results from an addition to, deletion from, or modification of the +contents of the Program, including, for purposes of clarity any new file +in Source Code form that contains any contents of the Program. Modified +Works shall not include works that contain only declarations, +interfaces, types, classes, structures, or files of the Program solely +in each case in order to link to, bind by name, or subclass the Program +or Modified Works thereof. + +"Distribute" means the acts of a) distributing or b) making available +in any manner that enables the transfer of a copy. + +"Source Code" means the form of a Program preferred for making +modifications, including but not limited to software source code, +documentation source, and configuration files. + +"Secondary License" means either the GNU General Public License, +Version 2.0, or any later versions of that license, including any +exceptions or additional permissions as identified by the initial +Contributor. + +2. 
GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare Derivative Works of, publicly display, + publicly perform, Distribute and sublicense the Contribution of such + Contributor, if any, and such Derivative Works. + + b) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, + if any, in Source Code or other form. This patent license shall + apply to the combination of the Contribution and the Program if, at + the time the Contribution is added by the Contributor, such addition + of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other + combinations which include the Contribution. No hardware per se is + licensed hereunder. + + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. + Each Contributor disclaims any liability to Recipient for claims + brought by any other entity based on infringement of intellectual + property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby + assumes sole responsibility to secure any other intellectual + property rights needed, if any. For example, if a third party + patent license is required to allow Recipient to Distribute the + Program, it is Recipient's responsibility to acquire that license + before distributing the Program. 
+ + d) Each Contributor represents that to its knowledge it has + sufficient copyright rights in its Contribution, if any, to grant + the copyright license set forth in this Agreement. + + e) Notwithstanding the terms of any Secondary License, no + Contributor makes additional grants to any Recipient (other than + those set forth in this Agreement) as a result of such Recipient's + receipt of the Program under the terms of a Secondary License + (if permitted under the terms of Section 3). + +3. REQUIREMENTS + +3.1 If a Contributor Distributes the Program in any form, then: + + a) the Program must also be made available as Source Code, in + accordance with section 3.2, and the Contributor must accompany + the Program with a statement that the Source Code for the Program + is available under this Agreement, and informs Recipients how to + obtain it in a reasonable manner on or through a medium customarily + used for software exchange; and + + b) the Contributor may Distribute the Program under a license + different than this Agreement, provided that such license: + i) effectively disclaims on behalf of all other Contributors all + warranties and conditions, express and implied, including + warranties or conditions of title and non-infringement, and + implied warranties or conditions of merchantability and fitness + for a particular purpose; + + ii) effectively excludes on behalf of all other Contributors all + liability for damages, including direct, indirect, special, + incidental and consequential damages, such as lost profits; + + iii) does not attempt to limit or alter the recipients' rights + in the Source Code under section 3.2; and + + iv) requires any subsequent distribution of the Program by any + party to be under a license that satisfies the requirements + of this section 3. 
+ +3.2 When the Program is Distributed as Source Code: + + a) it must be made available under this Agreement, or if the + Program (i) is combined with other material in a separate file or + files made available under a Secondary License, and (ii) the initial + Contributor attached to the Source Code the notice described in + Exhibit A of this Agreement, then the Program may be made available + under the terms of such Secondary Licenses, and + + b) a copy of this Agreement must be included with each copy of + the Program. + +3.3 Contributors may not remove or alter any copyright, patent, +trademark, attribution notices, disclaimers of warranty, or limitations +of liability ("notices") contained within the Program from any copy of +the Program which they Distribute, provided that Contributors may add +their own appropriate notices. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities +with respect to end users, business partners and the like. While this +license is intended to facilitate the commercial use of the Program, +the Contributor who includes the Program in a commercial product +offering should do so in a manner which does not create potential +liability for other Contributors. Therefore, if a Contributor includes +the Program in a commercial product offering, such Contributor +("Commercial Contributor") hereby agrees to defend and indemnify every +other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits +and other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such +Commercial Contributor in connection with its distribution of the Program +in a commercial product offering. The obligations in this section do not +apply to any claims or Losses relating to any actual or alleged +intellectual property infringement. 
In order to qualify, an Indemnified +Contributor must: a) promptly notify the Commercial Contributor in +writing of such claim, and b) allow the Commercial Contributor to control, +and cooperate with the Commercial Contributor in, the defense and any +related settlement negotiations. The Indemnified Contributor may +participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial +product offering, Product X. That Contributor is then a Commercial +Contributor. If that Commercial Contributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Contributor's responsibility +alone. Under this section, the Commercial Contributor would have to +defend claims against the other Contributors related to those performance +claims and warranties, and if a court requires any other Contributor to +pay any damages as a result, the Commercial Contributor must pay +those damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF +TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR +PURPOSE. Each Recipient is solely responsible for determining the +appropriateness of using and distributing the Program and assumes all +risks associated with its exercise of rights under this Agreement, +including but not limited to the risks and costs of program errors, +compliance with applicable laws, damage to or loss of data, programs +or equipment, and unavailability or interruption of operations. + +6. 
DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS +SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST +PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the +Program itself (excluding combinations of the Program with other software +or hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. If all Recipient's +rights under this Agreement terminate, Recipient agrees to cease use +and distribution of the Program as soon as reasonably practicable. +However, Recipient's obligations under this Agreement and any licenses +granted by Recipient relating to the Program shall continue and survive. 
+ +Everyone is permitted to copy and distribute copies of this Agreement, +but in order to avoid inconsistency the Agreement is copyrighted and +may only be modified in the following manner. The Agreement Steward +reserves the right to publish new versions (including revisions) of +this Agreement from time to time. No one other than the Agreement +Steward has the right to modify this Agreement. The Eclipse Foundation +is the initial Agreement Steward. The Eclipse Foundation may assign the +responsibility to serve as the Agreement Steward to a suitable separate +entity. Each new version of the Agreement will be given a distinguishing +version number. The Program (including Contributions) may always be +Distributed subject to the version of the Agreement under which it was +received. In addition, after a new version of the Agreement is published, +Contributor may elect to Distribute the Program (including its +Contributions) under the new version. + +Except as expressly stated in Sections 2(a) and 2(b) above, Recipient +receives no rights or licenses to the intellectual property of any +Contributor under this Agreement, whether expressly, by implication, +estoppel or otherwise. All rights in the Program not expressly granted +under this Agreement are reserved. Nothing in this Agreement is intended +to be enforceable by any entity that is not a Contributor or Recipient. +No third-party beneficiary rights are created under this Agreement. + +Exhibit A - Form of Secondary Licenses Notice + +"This Source Code may also be made available under the following +Secondary Licenses when the conditions for such availability set forth +in the Eclipse Public License, v. 2.0 are satisfied: {name license(s), +version(s), and exceptions or additional permissions here}." + + Simply including a copy of this Agreement, including this Exhibit A + is not sufficient to license the Source Code under Secondary Licenses.
+ + If it is not possible or desirable to put the notice in a particular + file, then You may include the notice in a location (such as a LICENSE + file in a relevant directory) where a recipient would be likely to + look for such a notice. + + You may add additional accurate notices of copyright ownership. +''', + ), + ( + 'EUPL-1.1', + '''\ +European Union Public Licence +V. 1.1 + + +EUPL © the European Community 2007 + + +This European Union Public Licence (the “EUPL”) applies to the +Work or Software (as defined below) which is provided under the terms of this +Licence. Any use of the Work, other than as authorised under this Licence is +prohibited (to the extent such use is covered by a right of the copyright +holder of the Work). + +The Original Work is provided under the terms of this +Licence when the Licensor (as defined below) has placed the following notice +immediately following the copyright notice for the Original Work: + +Licensed under the EUPL V.1.1 + +or has expressed by any other mean his willingness to license under the EUPL. + + +1. Definitions + +In this Licence, the +following terms have the following meaning: + +- The Licence: this Licence. + +- The Original Work or the Software: the software distributed +and/or communicated by the Licensor under this Licence, available as Source +Code and also as Executable Code as the case may be. + +- Derivative Works: +the works or software that could be created by the Licensee, based upon the +Original Work or modifications thereof. This Licence does not define the +extent of modification or dependence on the Original Work required in order to +classify a work as a Derivative Work; this extent is determined by copyright +law applicable in the country mentioned in Article 15. + +- The Work: the Original Work and/or its Derivative Works. + +- The Source Code: the human-readable form of the Work which is the most +convenient for people to study and modify. 
+ +- The Executable Code: any code which has generally been compiled and which +is meant to be interpreted by a computer as a program. + +- The Licensor: the natural or legal person that distributes and/or +communicates the Work under the Licence. + +- Contributor(s): any natural or legal person who modifies the Work under the +Licence, or otherwise contributes to the creation of a Derivative Work. + +- The Licensee or “You”: any natural or legal person who makes any usage of +the Software under the terms of the Licence. + +- Distribution and/or Communication: any act of selling, giving, lending, +renting, distributing, communicating, transmitting, or otherwise +making available, on-line or off-line, copies of the Work or providing access +to its essential functionalities at the disposal of any other natural or legal +person. + + +2. Scope of the rights granted by the Licence + +The Licensor hereby grants You a world-wide, royalty-free, non-exclusive, +sub-licensable licence to do the following, for the duration of copyright +vested in the Original Work: + +- use the Work in any circumstance and for all usage, +- reproduce the Work, +- modify the Original Work, and make Derivative Works +based upon the Work, +- communicate to the public, including the right to make available or display +the Work or copies thereof to the public and perform publicly, as the case +may be, the Work, +- distribute the Work or copies thereof, +- lend and rent the Work or copies thereof, +- sub-license rights in the Work or copies thereof. + +Those rights can be exercised on any media, supports and formats, whether now +known or later invented, as far as the applicable law permits so. + +In the countries where moral rights apply, the Licensor waives his right to +exercise his moral right to the extent allowed by law in order to make +effective the licence of the economic rights here above listed. 
+ +The Licensor grants to the Licensee royalty-free, non exclusive usage rights +to any patents held by the Licensor, to the extent necessary to make use of +the rights granted on the Work under this Licence. + + +3. Communication of the Source Code + +The Licensor may provide the Work either +in its Source Code form, or as Executable Code. If the Work is provided as +Executable Code, the Licensor provides in addition a machine-readable copy of +the Source Code of the Work along with each copy of the Work that the Licensor +distributes or indicates, in a notice following the copyright notice attached +to the Work, a repository where the Source Code is easily and freely +accessible for as long as the Licensor continues to distribute and/or +communicate the Work. + + +4. Limitations on copyright + +Nothing in this Licence is intended to deprive the Licensee of the benefits +from any exception or limitation to the exclusive rights of the rights owners +in the Original Work or Software, of the exhaustion of those rights or of +other applicable limitations thereto. + + +5. Obligations of the Licensee + +The grant of the rights mentioned above is subject to some restrictions and +obligations imposed on the Licensee. Those obligations are the following: + +Attribution right: +the Licensee shall keep intact all copyright, patent or trademarks notices and +all notices that refer to the Licence and to the disclaimer of warranties. The +Licensee must include a copy of such notices and a copy of the Licence with +every copy of the Work he/she distributes and/or communicates. The Licensee +must cause any Derivative Work to carry prominent notices stating that the +Work has been modified and the date of modification. 
+ +Copyleft clause: +If the Licensee distributes and/or communicates copies of the Original Works +or Derivative Works based upon the Original Work, this Distribution and/or +Communication will be done under the terms of this Licence or of a later +version of this Licence unless the Original Work is expressly distributed only +under this version of the Licence. The Licensee (becoming Licensor) cannot +offer or impose any additional terms or conditions on the Work or Derivative +Work that alter or restrict the terms of the Licence. + +Compatibility clause: +If the Licensee Distributes and/or Communicates Derivative Works or copies +thereof based upon both the Original Work and another work licensed under a +Compatible Licence, this Distribution and/or Communication can be done under +the terms of this Compatible Licence. For the sake of this clause, +“Compatible Licence” refers to the licences listed in the appendix +attached to this Licence. Should the Licensee’s obligations under the +Compatible Licence conflict with his/her obligations under this Licence, the +obligations of the Compatible Licence shall prevail. + +Provision of Source Code: +When distributing and/or communicating copies of the Work, the Licensee +will provide a machine-readable copy of the Source Code or indicate a +repository where this Source will be easily and freely available for as long +as the Licensee continues to distribute and/or communicate the Work. + +Legal Protection: +This Licence does not grant permission to use the trade names, +trademarks, service marks, or names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the copyright notice. + + +6. Chain of Authorship + +The original Licensor warrants that the copyright in the Original Work +granted hereunder is owned by him/her or licensed to him/her and +that he/she has the power and authority to grant the Licence. 
+ +Each Contributor warrants that the copyright in the modifications he/she +brings to the Work are owned by him/her or licensed to him/her and that +he/she has the power and authority to grant the Licence. + +Each time You accept the Licence, the original Licensor and subsequent +Contributors grant You a licence to their contributions to the Work, under +the terms of this Licence. + + +7. Disclaimer of Warranty + +The Work is a work in progress, which is continuously improved by numerous +contributors. It is not a finished work and may therefore contain defects or +“bugs” inherent to this type of software development. + +For the above reason, the Work is provided under the Licence on an “as is” +basis and without warranties of any kind concerning the Work, including +without limitation merchantability, fitness for a particular purpose, absence +of defects or errors, accuracy, non-infringement of intellectual property +rights other than copyright as stated in Article 6 of this Licence. + +This disclaimer of warranty is an essential part of the Licence and a +condition for the grant of any rights to the Work. + + +8. Disclaimer of Liability + +Except in the cases of wilful misconduct or damages directly caused to +natural persons, the Licensor will in no event be liable for any direct or +indirect, material or moral, damages of any kind, arising out of the Licence +or of the use of the Work, including without limitation, +damages for loss of goodwill, work stoppage, computer failure or malfunction, +loss of data or any commercial damage, even if the Licensor has been advised +of the possibility of such damage. However, the Licensor will be liable under +statutory product liability laws as far such laws apply to the Work. + + +9. 
Additional agreements + +While distributing the Original Work or Derivative Works, You may choose +to conclude an additional agreement to offer, and charge a fee for, +acceptance of support, warranty, indemnity, or other liability +obligations and/or services consistent with this Licence. However, in +accepting such obligations, You may act only on your own behalf and on your +sole responsibility, not on behalf of the original Licensor or any other +Contributor, and only if You agree to indemnify, defend, and hold each +Contributor harmless for any liability incurred by, or claims asserted against +such Contributor by the fact You have accepted any such warranty or additional +liability. + + +10. Acceptance of the Licence + +The provisions of this Licence can be accepted by clicking on +an icon “I agree” placed under the bottom of a window displaying the text of +this Licence or by affirming consent in any other similar way, in accordance +with the rules of applicable law. Clicking on that icon indicates your clear +and irrevocable acceptance of this Licence and +all of its terms and conditions. + +Similarly, you irrevocably accept this Licence and +all of its terms and conditions by exercising any rights granted to You +by Article 2 of this Licence, such as the use of the Work, +the creation by You of a Derivative Work or the Distribution and/or +Communication by You of the Work or copies thereof. + + +11. Information to the public + +In case of any Distribution and/or Communication of the Work by means of +electronic communication by You (for example, by offering to download +the Work from a remote location) the distribution channel or media (for +example, a website) must at least provide to the public the information +requested by the applicable law regarding the Licensor, the Licence and the +way it may be accessible, concluded, stored and reproduced by the +Licensee. + + +12. 
Termination of the Licence + +The Licence and the rights granted hereunder will terminate automatically +upon any breach by the Licensee of the terms of the Licence. + +Such a termination will not terminate the licences of any person who has +received the Work from the Licensee under the Licence, provided such persons +remain in full compliance with the Licence. + + +13. Miscellaneous + +Without prejudice of Article 9 above, the Licence represents the complete +agreement between the Parties as to the Work licensed hereunder. + +If any provision of the Licence is invalid or unenforceable under applicable +law, this will not affect the validity or enforceability of the Licence as a +whole. Such provision will be construed and/or reformed so as necessary +to make it valid and enforceable. + +The European Commission may publish other linguistic versions and/or new +versions of this Licence, so far this is required and reasonable, without +reducing the scope of the rights granted by the Licence. +New versions of the Licence will be published with a unique version number. + +All linguistic versions of this Licence, approved by the European Commission, +have identical value. Parties can take advantage of the linguistic version +of their choice. + + +14. Jurisdiction + +Any litigation resulting from the interpretation of this License, arising +between the European Commission, as a Licensor, and any Licensee, +will be subject to the jurisdiction of the Court of Justice of the +European Communities, as laid down in article 238 of the Treaty establishing +the European Community. + +Any litigation arising between Parties, other than the European Commission, +and resulting from the interpretation of this License, will be subject to the +exclusive jurisdiction of the competent court where the Licensor resides or +conducts its primary business. + + +15. 
Applicable Law + +This Licence shall be governed by the law of the European Union country where +the Licensor resides or has his registered office. + +This licence shall be governed by the Belgian law if: + +- a litigation arises between the European Commission, as a Licensor, and any +Licensee; +- the Licensor, other than the European Commission, has no residence or +registered office inside a European Union country. + + +=== + + +Appendix + + +“Compatible Licences” according to article 5 EUPL are: +- GNU General Public License (GNU GPL) v. 2 +- Open Software License (OSL) v. 2.1, v. 3.0 +- Common Public License v. 1.0 +- Eclipse Public License v. 1.0 +- Cecill v. 2.0 +''', + ), + ( + 'EUPL-1.2', + '''\ +European Union Public Licence +V. 1.2 + +EUPL © the European Union 2007, 2016 + +This European Union Public Licence (the ‘EUPL’) applies to the Work (as +defined below) which is provided under the terms of this Licence. Any use of +the Work, other than as authorised under this Licence is prohibited (to the +extent such use is covered by a right of the copyright holder of the Work). + +The Work is provided under the terms of this Licence when the Licensor (as +defined below) has placed the following notice immediately following the +copyright notice for the Work: “Licensed under the EUPL”, or has expressed by +any other means his willingness to license under the EUPL. + +1. Definitions + +In this Licence, the following terms have the following meaning: +— ‘The Licence’: this Licence. +— ‘The Original Work’: the work or software distributed or communicated by the + ‘Licensor under this Licence, available as Source Code and also as + ‘Executable Code as the case may be. +— ‘Derivative Works’: the works or software that could be created by the + ‘Licensee, based upon the Original Work or modifications thereof. 
This + ‘Licence does not define the extent of modification or dependence on the + ‘Original Work required in order to classify a work as a Derivative Work; + ‘this extent is determined by copyright law applicable in the country + ‘mentioned in Article 15. +— ‘The Work’: the Original Work or its Derivative Works. +— ‘The Source Code’: the human-readable form of the Work which is the most + convenient for people to study and modify. + +— ‘The Executable Code’: any code which has generally been compiled and which + is meant to be interpreted by a computer as a program. +— ‘The Licensor’: the natural or legal person that distributes or communicates + the Work under the Licence. +— ‘Contributor(s)’: any natural or legal person who modifies the Work under + the Licence, or otherwise contributes to the creation of a Derivative Work. +— ‘The Licensee’ or ‘You’: any natural or legal person who makes any usage of + the Work under the terms of the Licence. +— ‘Distribution’ or ‘Communication’: any act of selling, giving, lending, + renting, distributing, communicating, transmitting, or otherwise making + available, online or offline, copies of the Work or providing access to its + essential functionalities at the disposal of any other natural or legal + person. + +2. Scope of the rights granted by the Licence + +The Licensor hereby grants You a worldwide, royalty-free, non-exclusive, +sublicensable licence to do the following, for the duration of copyright +vested in the Original Work: + +— use the Work in any circumstance and for all usage, +— reproduce the Work, +— modify the Work, and make Derivative Works based upon the Work, +— communicate to the public, including the right to make available or display + the Work or copies thereof to the public and perform publicly, as the case + may be, the Work, +— distribute the Work or copies thereof, +— lend and rent the Work or copies thereof, +— sublicense rights in the Work or copies thereof. 
+ +Those rights can be exercised on any media, supports and formats, whether now +known or later invented, as far as the applicable law permits so. + +In the countries where moral rights apply, the Licensor waives his right to +exercise his moral right to the extent allowed by law in order to make +effective the licence of the economic rights here above listed. + +The Licensor grants to the Licensee royalty-free, non-exclusive usage rights +to any patents held by the Licensor, to the extent necessary to make use of +the rights granted on the Work under this Licence. + +3. Communication of the Source Code + +The Licensor may provide the Work either in its Source Code form, or as +Executable Code. If the Work is provided as Executable Code, the Licensor +provides in addition a machine-readable copy of the Source Code of the Work +along with each copy of the Work that the Licensor distributes or indicates, +in a notice following the copyright notice attached to the Work, a repository +where the Source Code is easily and freely accessible for as long as the +Licensor continues to distribute or communicate the Work. + +4. Limitations on copyright + +Nothing in this Licence is intended to deprive the Licensee of the benefits +from any exception or limitation to the exclusive rights of the rights owners +in the Work, of the exhaustion of those rights or of other applicable +limitations thereto. + +5. Obligations of the Licensee + +The grant of the rights mentioned above is subject to some restrictions and +obligations imposed on the Licensee. Those obligations are the following: + +Attribution right: The Licensee shall keep intact all copyright, patent or +trademarks notices and all notices that refer to the Licence and to the +disclaimer of warranties. The Licensee must include a copy of such notices and +a copy of the Licence with every copy of the Work he/she distributes or +communicates. 
The Licensee must cause any Derivative Work to carry prominent +notices stating that the Work has been modified and the date of modification. + +Copyleft clause: If the Licensee distributes or communicates copies of the +Original Works or Derivative Works, this Distribution or Communication will be +done under the terms of this Licence or of a later version of this Licence +unless the Original Work is expressly distributed only under this version of +the Licence — for example by communicating ‘EUPL v. 1.2 only’. The Licensee +(becoming Licensor) cannot offer or impose any additional terms or conditions +on the Work or Derivative Work that alter or restrict the terms of the +Licence. + +Compatibility clause: If the Licensee Distributes or Communicates Derivative +Works or copies thereof based upon both the Work and another work licensed +under a Compatible Licence, this Distribution or Communication can be done +under the terms of this Compatible Licence. For the sake of this clause, +‘Compatible Licence’ refers to the licences listed in the appendix attached to +this Licence. Should the Licensee's obligations under the Compatible Licence +conflict with his/her obligations under this Licence, the obligations of the +Compatible Licence shall prevail. + +Provision of Source Code: When distributing or communicating copies of the +Work, the Licensee will provide a machine-readable copy of the Source Code or +indicate a repository where this Source will be easily and freely available +for as long as the Licensee continues to distribute or communicate the Work. + +Legal Protection: This Licence does not grant permission to use the trade +names, trademarks, service marks, or names of the Licensor, except as required +for reasonable and customary use in describing the origin of the Work and +reproducing the content of the copyright notice. + +6. 
Chain of Authorship + +The original Licensor warrants that the copyright in the Original Work granted +hereunder is owned by him/her or licensed to him/her and that he/she has the +power and authority to grant the Licence. + +Each Contributor warrants that the copyright in the modifications he/she +brings to the Work are owned by him/her or licensed to him/her and that he/she +has the power and authority to grant the Licence. + +Each time You accept the Licence, the original Licensor and subsequent +Contributors grant You a licence to their contributions to the Work, under the +terms of this Licence. + +7. Disclaimer of Warranty + +The Work is a work in progress, which is continuously improved by numerous +Contributors. It is not a finished work and may therefore contain defects or +‘bugs’ inherent to this type of development. + +For the above reason, the Work is provided under the Licence on an ‘as is’ +basis and without warranties of any kind concerning the Work, including +without limitation merchantability, fitness for a particular purpose, absence +of defects or errors, accuracy, non-infringement of intellectual property +rights other than copyright as stated in Article 6 of this Licence. + +This disclaimer of warranty is an essential part of the Licence and a +condition for the grant of any rights to the Work. + +8. Disclaimer of Liability + +Except in the cases of wilful misconduct or damages directly caused to natural +persons, the Licensor will in no event be liable for any direct or indirect, +material or moral, damages of any kind, arising out of the Licence or of the +use of the Work, including without limitation, damages for loss of goodwill, +work stoppage, computer failure or malfunction, loss of data or any commercial +damage, even if the Licensor has been advised of the possibility of such +damage. However, the Licensor will be liable under statutory product liability +laws as far such laws apply to the Work. + +9. 
Additional agreements + +While distributing the Work, You may choose to conclude an additional +agreement, defining obligations or services consistent with this Licence. +However, if accepting obligations, You may act only on your own behalf and on +your sole responsibility, not on behalf of the original Licensor or any other +Contributor, and only if You agree to indemnify, defend, and hold each +Contributor harmless for any liability incurred by, or claims asserted against +such Contributor by the fact You have accepted any warranty or additional +liability. + +10. Acceptance of the Licence + +The provisions of this Licence can be accepted by clicking on an icon ‘I +agree’ placed under the bottom of a window displaying the text of this Licence +or by affirming consent in any other similar way, in accordance with the rules +of applicable law. Clicking on that icon indicates your clear and irrevocable +acceptance of this Licence and all of its terms and conditions. + +Similarly, you irrevocably accept this Licence and all of its terms and +conditions by exercising any rights granted to You by Article 2 of this +Licence, such as the use of the Work, the creation by You of a Derivative Work +or the Distribution or Communication by You of the Work or copies thereof. + +11. Information to the public + +In case of any Distribution or Communication of the Work by means of +electronic communication by You (for example, by offering to download the Work +from a remote location) the distribution channel or media (for example, a +website) must at least provide to the public the information requested by the +applicable law regarding the Licensor, the Licence and the way it may be +accessible, concluded, stored and reproduced by the Licensee. + +12. Termination of the Licence + +The Licence and the rights granted hereunder will terminate automatically upon +any breach by the Licensee of the terms of the Licence. 
Such a termination +will not terminate the licences of any person who has received the Work from +the Licensee under the Licence, provided such persons remain in full +compliance with the Licence. + +13. Miscellaneous + +Without prejudice of Article 9 above, the Licence represents the complete +agreement between the Parties as to the Work. + +If any provision of the Licence is invalid or unenforceable under applicable +law, this will not affect the validity or enforceability of the Licence as a +whole. Such provision will be construed or reformed so as necessary to make it +valid and enforceable. + +The European Commission may publish other linguistic versions or new versions +of this Licence or updated versions of the Appendix, so far this is required +and reasonable, without reducing the scope of the rights granted by the +Licence. New versions of the Licence will be published with a unique version +number. + +All linguistic versions of this Licence, approved by the European Commission, +have identical value. Parties can take advantage of the linguistic version of +their choice. + +14. Jurisdiction + +Without prejudice to specific agreement between parties, +— any litigation resulting from the interpretation of this License, arising + between the European Union institutions, bodies, offices or agencies, as a + Licensor, and any Licensee, will be subject to the jurisdiction of the Court + of Justice of the European Union, as laid down in article 272 of the Treaty + on the Functioning of the European Union, +— any litigation arising between other parties and resulting from the + interpretation of this License, will be subject to the exclusive + jurisdiction of the competent court where the Licensor resides or conducts + its primary business. + +15. 
Applicable Law + +Without prejudice to specific agreement between parties, +— this Licence shall be governed by the law of the European Union Member State + where the Licensor has his seat, resides or has his registered office, +— this licence shall be governed by Belgian law if the Licensor has no seat, + residence or registered office inside a European Union Member State. + +Appendix + +‘Compatible Licences’ according to Article 5 EUPL are: +— GNU General Public License (GPL) v. 2, v. 3 +— GNU Affero General Public License (AGPL) v. 3 +— Open Software License (OSL) v. 2.1, v. 3.0 +— Eclipse Public License (EPL) v. 1.0 +— CeCILL v. 2.0, v. 2.1 +— Mozilla Public Licence (MPL) v. 2 +— GNU Lesser General Public Licence (LGPL) v. 2.1, v. 3 +— Creative Commons Attribution-ShareAlike v. 3.0 Unported (CC BY-SA 3.0) for + works other than software +— European Union Public Licence (EUPL) v. 1.1, v. 1.2 +— Québec Free and Open-Source Licence — Reciprocity (LiLiQ-R) or + Strong Reciprocity (LiLiQ-R+) + +— The European Commission may update this Appendix to later versions of the + above licences without producing a new version of the EUPL, as long as they + provide the rights granted in Article 2 of this Licence and protect the + covered Source Code from exclusive appropriation. +— All other changes or additions to this Appendix require the production of a + new EUPL version. +''', + ), + ( + 'GPL-2.0', + '''\ +GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. +''', + ), + ( + 'GPL-3.0', + '''\ +GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
+''', + ), + ( + 'ISC', + '''\ +ISC License + +Copyright (c) [year], [fullname] + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +''', + ), + ( + 'LGPL-2.1', + '''\ +GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. 
You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. 
+ + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 + USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random + Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! +''', + ), + ( + 'LGPL-3.0', + '''\ +GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions.
+ + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. 
+ + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. 
+ + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. 
+ + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. 
+''', + ), + ( + 'LPPL-1.3c', + '''\ +The LaTeX Project Public License +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +LPPL Version 1.3c 2008-05-04 + +Copyright 1999 2002-2008 LaTeX3 Project + Everyone is allowed to distribute verbatim copies of this + license document, but modification of it is not allowed. + + +PREAMBLE +======== + +The LaTeX Project Public License (LPPL) is the primary license under +which the LaTeX kernel and the base LaTeX packages are distributed. + +You may use this license for any work of which you hold the copyright +and which you wish to distribute. This license may be particularly +suitable if your work is TeX-related (such as a LaTeX package), but +it is written in such a way that you can use it even if your work is +unrelated to TeX. + +The section `WHETHER AND HOW TO DISTRIBUTE WORKS UNDER THIS LICENSE', +below, gives instructions, examples, and recommendations for authors +who are considering distributing their works under this license. + +This license gives conditions under which a work may be distributed +and modified, as well as conditions under which modified versions of +that work may be distributed. + +We, the LaTeX3 Project, believe that the conditions below give you +the freedom to make and distribute modified versions of your work +that conform with whatever technical specifications you wish while +maintaining the availability, integrity, and reliability of +that work. If you do not see how to achieve your goal while +meeting these conditions, then read the document `cfgguide.tex' +and `modguide.tex' in the base LaTeX distribution for suggestions. + + +DEFINITIONS +=========== + +In this license document the following terms are used: + + `Work' + Any work being distributed under this License. + + `Derived Work' + Any work that under any applicable law is derived from the Work. 
+ + `Modification' + Any procedure that produces a Derived Work under any applicable + law -- for example, the production of a file containing an + original file associated with the Work or a significant portion of + such a file, either verbatim or with modifications and/or + translated into another language. + + `Modify' + To apply any procedure that produces a Derived Work under any + applicable law. + + `Distribution' + Making copies of the Work available from one person to another, in + whole or in part. Distribution includes (but is not limited to) + making any electronic components of the Work accessible by + file transfer protocols such as FTP or HTTP or by shared file + systems such as Sun's Network File System (NFS). + + `Compiled Work' + A version of the Work that has been processed into a form where it + is directly usable on a computer system. This processing may + include using installation facilities provided by the Work, + transformations of the Work, copying of components of the Work, or + other activities. Note that modification of any installation + facilities provided by the Work constitutes modification of the Work. + + `Current Maintainer' + A person or persons nominated as such within the Work. If there is + no such explicit nomination then it is the `Copyright Holder' under + any applicable law. + + `Base Interpreter' + A program or process that is normally needed for running or + interpreting a part or the whole of the Work. + + A Base Interpreter may depend on external components but these + are not considered part of the Base Interpreter provided that each + external component clearly identifies itself whenever it is used + interactively. Unless explicitly specified when applying the + license to the Work, the only applicable Base Interpreter is a + `LaTeX-Format' or in the case of files belonging to the + `LaTeX-format' a program implementing the `TeX language'. 
+ + + +CONDITIONS ON DISTRIBUTION AND MODIFICATION +=========================================== + +1. Activities other than distribution and/or modification of the Work +are not covered by this license; they are outside its scope. In +particular, the act of running the Work is not restricted and no +requirements are made concerning any offers of support for the Work. + +2. You may distribute a complete, unmodified copy of the Work as you +received it. Distribution of only part of the Work is considered +modification of the Work, and no right to distribute such a Derived +Work may be assumed under the terms of this clause. + +3. You may distribute a Compiled Work that has been generated from a +complete, unmodified copy of the Work as distributed under Clause 2 +above, as long as that Compiled Work is distributed in such a way that +the recipients may install the Compiled Work on their system exactly +as it would have been installed if they generated a Compiled Work +directly from the Work. + +4. If you are the Current Maintainer of the Work, you may, without +restriction, modify the Work, thus creating a Derived Work. You may +also distribute the Derived Work without restriction, including +Compiled Works generated from the Derived Work. Derived Works +distributed in this manner by the Current Maintainer are considered to +be updated versions of the Work. + +5. If you are not the Current Maintainer of the Work, you may modify +your copy of the Work, thus creating a Derived Work based on the Work, +and compile this Derived Work, thus creating a Compiled Work based on +the Derived Work. + +6. If you are not the Current Maintainer of the Work, you may +distribute a Derived Work provided the following conditions are met +for every component of the Work unless that component clearly states +in the copyright notice that it is exempt from that condition. Only +the Current Maintainer is allowed to add such statements of exemption +to a component of the Work. + + a. 
If a component of this Derived Work can be a direct replacement + for a component of the Work when that component is used with the + Base Interpreter, then, wherever this component of the Work + identifies itself to the user when used interactively with that + Base Interpreter, the replacement component of this Derived Work + clearly and unambiguously identifies itself as a modified version + of this component to the user when used interactively with that + Base Interpreter. + + b. Every component of the Derived Work contains prominent notices + detailing the nature of the changes to that component, or a + prominent reference to another file that is distributed as part + of the Derived Work and that contains a complete and accurate log + of the changes. + + c. No information in the Derived Work implies that any persons, + including (but not limited to) the authors of the original version + of the Work, provide any support, including (but not limited to) + the reporting and handling of errors, to recipients of the + Derived Work unless those persons have stated explicitly that + they do provide such support for the Derived Work. + + d. You distribute at least one of the following with the Derived Work: + + 1. A complete, unmodified copy of the Work; + if your distribution of a modified component is made by + offering access to copy the modified component from a + designated place, then offering equivalent access to copy + the Work from the same or some similar place meets this + condition, even though third parties are not compelled to + copy the Work along with the modified component; + + 2. Information that is sufficient to obtain a complete, + unmodified copy of the Work. + +7. 
If you are not the Current Maintainer of the Work, you may +distribute a Compiled Work generated from a Derived Work, as long as +the Derived Work is distributed to all recipients of the Compiled +Work, and as long as the conditions of Clause 6, above, are met with +regard to the Derived Work. + +8. The conditions above are not intended to prohibit, and hence do not +apply to, the modification, by any method, of any component so that it +becomes identical to an updated version of that component of the Work as +it is distributed by the Current Maintainer under Clause 4, above. + +9. Distribution of the Work or any Derived Work in an alternative +format, where the Work or that Derived Work (in whole or in part) is +then produced by applying some process to that format, does not relax or +nullify any sections of this license as they pertain to the results of +applying that process. + +10. a. A Derived Work may be distributed under a different license + provided that license itself honors the conditions listed in + Clause 6 above, in regard to the Work, though it does not have + to honor the rest of the conditions in this license. + + b. If a Derived Work is distributed under a different license, that + Derived Work must provide sufficient documentation as part of + itself to allow each recipient of that Derived Work to honor the + restrictions in Clause 6 above, concerning changes from the Work. + +11. This license places no restrictions on works that are unrelated to +the Work, nor does this license place any restrictions on aggregating +such works with the Work by any means. + +12. Nothing in this license is intended to, or may be used to, prevent +complete compliance by all parties with all applicable laws. + + +NO WARRANTY +=========== + +There is no warranty for the Work. 
Except when otherwise stated in +writing, the Copyright Holder provides the Work `as is', without +warranty of any kind, either expressed or implied, including, but not +limited to, the implied warranties of merchantability and fitness for a +particular purpose. The entire risk as to the quality and performance +of the Work is with you. Should the Work prove defective, you assume +the cost of all necessary servicing, repair, or correction. + +In no event unless required by applicable law or agreed to in writing +will The Copyright Holder, or any author named in the components of the +Work, or any other party who may distribute and/or modify the Work as +permitted above, be liable to you for damages, including any general, +special, incidental or consequential damages arising out of any use of +the Work or out of inability to use the Work (including, but not limited +to, loss of data, data being rendered inaccurate, or losses sustained by +anyone as a result of any failure of the Work to operate with any other +programs), even if the Copyright Holder or said author or said other +party has been advised of the possibility of such damages. + + +MAINTENANCE OF THE WORK +======================= + +The Work has the status `author-maintained' if the Copyright Holder +explicitly and prominently states near the primary copyright notice in +the Work that the Work can only be maintained by the Copyright Holder +or simply that it is `author-maintained'. + +The Work has the status `maintained' if there is a Current Maintainer +who has indicated in the Work that they are willing to receive error +reports for the Work (for example, by supplying a valid e-mail +address). It is not required for the Current Maintainer to acknowledge +or act upon these error reports. 
+ +The Work changes from status `maintained' to `unmaintained' if there +is no Current Maintainer, or the person stated to be Current +Maintainer of the work cannot be reached through the indicated means +of communication for a period of six months, and there are no other +significant signs of active maintenance. + +You can become the Current Maintainer of the Work by agreement with +any existing Current Maintainer to take over this role. + +If the Work is unmaintained, you can become the Current Maintainer of +the Work through the following steps: + + 1. Make a reasonable attempt to trace the Current Maintainer (and + the Copyright Holder, if the two differ) through the means of + an Internet or similar search. + + 2. If this search is successful, then enquire whether the Work + is still maintained. + + a. If it is being maintained, then ask the Current Maintainer + to update their communication data within one month. + + b. If the search is unsuccessful or no action to resume active + maintenance is taken by the Current Maintainer, then announce + within the pertinent community your intention to take over + maintenance. (If the Work is a LaTeX work, this could be + done, for example, by posting to comp.text.tex.) + + 3a. If the Current Maintainer is reachable and agrees to pass + maintenance of the Work to you, then this takes effect + immediately upon announcement. + + b. If the Current Maintainer is not reachable and the Copyright + Holder agrees that maintenance of the Work be passed to you, + then this takes effect immediately upon announcement. + + 4. If you make an `intention announcement' as described in 2b. above + and after three months your intention is challenged neither by + the Current Maintainer nor by the Copyright Holder nor by other + people, then you may arrange for the Work to be changed so as + to name you as the (new) Current Maintainer. + + 5. 
If the previously unreachable Current Maintainer becomes + reachable once more within three months of a change completed + under the terms of 3b) or 4), then that Current Maintainer must + become or remain the Current Maintainer upon request provided + they then update their communication data within one month. + +A change in the Current Maintainer does not, of itself, alter the fact +that the Work is distributed under the LPPL license. + +If you become the Current Maintainer of the Work, you should +immediately provide, within the Work, a prominent and unambiguous +statement of your status as Current Maintainer. You should also +announce your new status to the same pertinent community as +in 2b) above. + + +WHETHER AND HOW TO DISTRIBUTE WORKS UNDER THIS LICENSE +====================================================== + +This section contains important instructions, examples, and +recommendations for authors who are considering distributing their +works under this license. These authors are addressed as `you' in +this section. + +Choosing This License or Another License +---------------------------------------- + +If for any part of your work you want or need to use *distribution* +conditions that differ significantly from those in this license, then +do not refer to this license anywhere in your work but, instead, +distribute your work under a different license. You may use the text +of this license as a model for your own license, but your license +should not refer to the LPPL or otherwise give the impression that +your work is distributed under the LPPL. + +The document `modguide.tex' in the base LaTeX distribution explains +the motivation behind the conditions of this license. It explains, +for example, why distributing LaTeX under the GNU General Public +License (GPL) was considered inappropriate. 
Even if your work is +unrelated to LaTeX, the discussion in `modguide.tex' may still be +relevant, and authors intending to distribute their works under any +license are encouraged to read it. + +A Recommendation on Modification Without Distribution +----------------------------------------------------- + +It is wise never to modify a component of the Work, even for your own +personal use, without also meeting the above conditions for +distributing the modified component. While you might intend that such +modifications will never be distributed, often this will happen by +accident -- you may forget that you have modified that component; or +it may not occur to you when allowing others to access the modified +version that you are thus distributing it and violating the conditions +of this license in ways that could have legal implications and, worse, +cause problems for the community. It is therefore usually in your +best interest to keep your copy of the Work identical with the public +one. Many works provide ways to control the behavior of that work +without altering any of its licensed components. + +How to Use This License +----------------------- + +To use this license, place in each of the components of your work both +an explicit copyright notice including your name and the year the work +was authored and/or last substantially modified. Include also a +statement that the distribution and/or modification of that +component is constrained by the conditions in this license. + +Here is an example of such a notice and statement: + + %% pig.dtx + %% Copyright 2005 M. Y. Name + % + % This work may be distributed and/or modified under the + % conditions of the LaTeX Project Public License, either version 1.3 + % of this license or (at your option) any later version. + % The latest version of this license is in + % http://www.latex-project.org/lppl.txt + % and version 1.3 or later is part of all distributions of LaTeX + % version 2005/12/01 or later. 
+ % + % This work has the LPPL maintenance status `maintained'. + % + % The Current Maintainer of this work is M. Y. Name. + % + % This work consists of the files pig.dtx and pig.ins + % and the derived file pig.sty. + +Given such a notice and statement in a file, the conditions +given in this license document would apply, with the `Work' referring +to the three files `pig.dtx', `pig.ins', and `pig.sty' (the last being +generated from `pig.dtx' using `pig.ins'), the `Base Interpreter' +referring to any `LaTeX-Format', and both `Copyright Holder' and +`Current Maintainer' referring to the person `M. Y. Name'. + +If you do not want the Maintenance section of LPPL to apply to your +Work, change `maintained' above into `author-maintained'. +However, we recommend that you use `maintained', as the Maintenance +section was added in order to ensure that your Work remains useful to +the community even when you can no longer maintain and support it +yourself. + +Derived Works That Are Not Replacements +--------------------------------------- + +Several clauses of the LPPL specify means to provide reliability and +stability for the user community. They therefore concern themselves +with the case that a Derived Work is intended to be used as a +(compatible or incompatible) replacement of the original Work. If +this is not the case (e.g., if a few lines of code are reused for a +completely different task), then clauses 6b and 6d shall not apply. + + +Important Recommendations +------------------------- + + Defining What Constitutes the Work + + The LPPL requires that distributions of the Work contain all the + files of the Work. It is therefore important that you provide a + way for the licensee to determine which files constitute the Work. + This could, for example, be achieved by explicitly listing all the + files of the Work near the copyright notice of each file or by + using a line such as: + + % This work consists of all files listed in manifest.txt. + + in that place. 
In the absence of an unequivocal list it might be + impossible for the licensee to determine what is considered by you + to comprise the Work and, in such a case, the licensee would be + entitled to make reasonable conjectures as to which files comprise + the Work. +''', + ), + ( + 'MIT', + '''\ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +''', + ), + ( + 'MPL-2.0', + '''\ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. 
"Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. 
"Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. +''', + ), + ( + 'MS-PL', + '''\ +Microsoft Public License (Ms-PL) + +This license governs use of the accompanying software. If you use the +software, you accept this license. If you do not accept the license, do not +use the software. + +1. Definitions +The terms "reproduce," "reproduction," "derivative works," and "distribution" +have the same meaning here as under U.S. copyright law. A "contribution" is +the original software, or any additions or changes to the software. A +"contributor" is any person that distributes its contribution under this +license. "Licensed patents" are a contributor's patent claims that read +directly on its contribution. + +2. Grant of Rights + (A) Copyright Grant- Subject to the terms of this license, including the + license conditions and limitations in section 3, each contributor grants + you a non-exclusive, worldwide, royalty-free copyright license to + reproduce its contribution, prepare derivative works of its contribution, + and distribute its contribution or any derivative works that you create. + + (B) Patent Grant- Subject to the terms of this license, including the + license conditions and limitations in section 3, each contributor grants + you a non-exclusive, worldwide, royalty-free license under its licensed + patents to make, have made, use, sell, offer for sale, import, and/or + otherwise dispose of its contribution in the software or derivative works + of the contribution in the software. + +3. Conditions and Limitations + (A) No Trademark License- This license does not grant you rights to use + any contributors' name, logo, or trademarks. 
+ + (B) If you bring a patent claim against any contributor over patents that + you claim are infringed by the software, your patent license from such + contributor to the software ends automatically. + + (C) If you distribute any portion of the software, you must retain all + copyright, patent, trademark, and attribution notices that are present in + the software. + + (D) If you distribute any portion of the software in source code form, + you may do so only under this license by including a complete copy of + this license with your distribution. If you distribute any portion of the + software in compiled or object code form, you may only do so under a + license that complies with this license. + + (E) The software is licensed "as-is." You bear the risk of using it. The + contributors give no express warranties, guarantees, or conditions. You + may have additional consumer rights under your local laws which this + license cannot change. To the extent permitted under your local laws, the + contributors exclude the implied warranties of merchantability, fitness + for a particular purpose and non-infringement. +''', + ), + ( + 'MS-RL', + '''\ +Microsoft Reciprocal License (Ms-RL) + +This license governs use of the accompanying software. If you use the +software, you accept this license. If you do not accept the license, do not +use the software. + +1. Definitions +The terms "reproduce," "reproduction," "derivative works," and "distribution" +have the same meaning here as under U.S. copyright law. + +A "contribution" is the original software, or any additions or changes to the +software. + +A "contributor" is any person that distributes its contribution under this +license. + +"Licensed patents" are a contributor's patent claims that read directly on its +contribution. + +2. 
Grant of Rights + (A) Copyright Grant- Subject to the terms of this license, including the + license conditions and limitations in section 3, each contributor grants + you a non-exclusive, worldwide, royalty-free copyright license to + reproduce its contribution, prepare derivative works of its contribution, + and distribute its contribution or any derivative works that you create. + + (B) Patent Grant- Subject to the terms of this license, including the + license conditions and limitations in section 3, each contributor grants + you a non-exclusive, worldwide, royalty-free license under its licensed + patents to make, have made, use, sell, offer for sale, import, and/or + otherwise dispose of its contribution in the software or derivative works + of the contribution in the software. + +3. Conditions and Limitations + (A) Reciprocal Grants- For any file you distribute that contains code + from the software (in source code or binary format), you must provide + recipients the source code to that file along with a copy of this + license, which license will govern that file. You may license other files + that are entirely your own work and do not contain code from the software + under any terms you choose. + + (B) No Trademark License- This license does not grant you rights to use + any contributors' name, logo, or trademarks. + + (C) If you bring a patent claim against any contributor over patents that + you claim are infringed by the software, your patent license from such + contributor to the software ends automatically. + + (D) If you distribute any portion of the software, you must retain all + copyright, patent, trademark, and attribution notices that are present in + the software. + + (E) If you distribute any portion of the software in source code form, + you may do so only under this license by including a complete copy of + this license with your distribution. 
If you distribute any portion of the + software in compiled or object code form, you may only do so under a + license that complies with this license. + + (F) The software is licensed "as-is." You bear the risk of using it. The + contributors give no express warranties, guarantees, or conditions. You + may have additional consumer rights under your local laws which this + license cannot change. To the extent permitted under your local laws, the + contributors exclude the implied warranties of merchantability, fitness + for a particular purpose and non-infringement. +''', + ), + ( + 'NCSA', + '''\ +University of Illinois/NCSA Open Source License +Copyright (c) [year] [fullname]. All rights reserved. +Developed by: [project] [fullname] [projecturl] + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. + +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. + +* Neither the names of [fullname], [project] nor the names of its contributors may be used to endorse or promote products derived from + this Software without specific prior written permission. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +THE SOFTWARE. +''', + ), + ( + 'OFL-1.1', + '''\ +Copyright (c) [year] [fullname] ([email]) + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s).
+ +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION AND CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. 
The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. +''', + ), + ( + 'OSL-3.0', + '''\ +Open Software License ("OSL") v 3.0 + +This Open Software License (the "License") applies to any original work of +authorship (the "Original Work") whose owner (the "Licensor") has placed the +following licensing notice adjacent to the copyright notice for the Original +Work: + +Licensed under the Open Software License version 3.0 + +1) Grant of Copyright License. 
Licensor grants You a worldwide, royalty-free, +non-exclusive, sublicensable license, for the duration of the copyright, to do +the following: + + a) to reproduce the Original Work in copies, either alone or as part of a + collective work; + + b) to translate, adapt, alter, transform, modify, or arrange the Original + Work, thereby creating derivative works ("Derivative Works") based upon the + Original Work; + + c) to distribute or communicate copies of the Original Work and Derivative + Works to the public, with the proviso that copies of Original Work or + Derivative Works that You distribute or communicate shall be licensed under + this Open Software License; + + d) to perform the Original Work publicly; and + + e) to display the Original Work publicly. + +2) Grant of Patent License. Licensor grants You a worldwide, royalty-free, +non-exclusive, sublicensable license, under patent claims owned or controlled +by the Licensor that are embodied in the Original Work as furnished by the +Licensor, for the duration of the patents, to make, use, sell, offer for sale, +have made, and import the Original Work and Derivative Works. + +3) Grant of Source Code License. The term "Source Code" means the preferred +form of the Original Work for making modifications to it and all available +documentation describing how to modify the Original Work. Licensor agrees to +provide a machine-readable copy of the Source Code of the Original Work along +with each copy of the Original Work that Licensor distributes. Licensor +reserves the right to satisfy this obligation by placing a machine-readable +copy of the Source Code in an information repository reasonably calculated to +permit inexpensive and convenient access by You for as long as Licensor +continues to distribute the Original Work. + +4) Exclusions From License Grant. 
Neither the names of Licensor, nor the names +of any contributors to the Original Work, nor any of their trademarks or +service marks, may be used to endorse or promote products derived from this +Original Work without express prior permission of the Licensor. Except as +expressly stated herein, nothing in this License grants any license to +Licensor's trademarks, copyrights, patents, trade secrets or any other +intellectual property. No patent license is granted to make, use, sell, offer +for sale, have made, or import embodiments of any patent claims other than the +licensed claims defined in Section 2. No license is granted to the trademarks +of Licensor even if such marks are included in the Original Work. Nothing in +this License shall be interpreted to prohibit Licensor from licensing under +terms different from this License any Original Work that Licensor otherwise +would have a right to license. + +5) External Deployment. The term "External Deployment" means the use, +distribution, or communication of the Original Work or Derivative Works in any +way such that the Original Work or Derivative Works may be used by anyone +other than You, whether those works are distributed or communicated to those +persons or made available as an application intended for use over a network. +As an express condition for the grants of license hereunder, You must treat +any External Deployment by You of the Original Work or a Derivative Work as a +distribution under section 1(c). + +6) Attribution Rights. You must retain, in the Source Code of any Derivative +Works that You create, all copyright, patent, or trademark notices from the +Source Code of the Original Work, as well as any notices of licensing and any +descriptive text identified therein as an "Attribution Notice." You must cause +the Source Code for any Derivative Works that You create to carry a prominent +Attribution Notice reasonably calculated to inform recipients that You have +modified the Original Work. 
+ +7) Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that +the copyright in and to the Original Work and the patent rights granted herein +by Licensor are owned by the Licensor or are sublicensed to You under the +terms of this License with the permission of the contributor(s) of those +copyrights and patent rights. Except as expressly stated in the immediately +preceding sentence, the Original Work is provided under this License on an "AS +IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without +limitation, the warranties of non-infringement, merchantability or fitness for +a particular purpose. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK +IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this +License. No license to the Original Work is granted by this License except +under this disclaimer. + +8) Limitation of Liability. Under no circumstances and under no legal theory, +whether in tort (including negligence), contract, or otherwise, shall the +Licensor be liable to anyone for any indirect, special, incidental, or +consequential damages of any character arising as a result of this License or +the use of the Original Work including, without limitation, damages for loss +of goodwill, work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses. This limitation of liability shall not +apply to the extent applicable law prohibits such limitation. + +9) Acceptance and Termination. If, at any time, You expressly assented to this +License, that assent indicates your clear and irrevocable acceptance of this +License and all of its terms and conditions. If You distribute or communicate +copies of the Original Work or a Derivative Work, You must make a reasonable +effort under the circumstances to obtain the express assent of recipients to +the terms of this License. 
This License conditions your rights to undertake +the activities listed in Section 1, including your right to create Derivative +Works based upon the Original Work, and doing so without honoring these terms +and conditions is prohibited by copyright law and international treaty. +Nothing in this License is intended to affect copyright exceptions and +limitations (including "fair use" or "fair dealing"). This License shall +terminate immediately and You may no longer exercise any of the rights granted +to You by this License upon your failure to honor the conditions in Section +1(c). + +10) Termination for Patent Action. This License shall terminate automatically +and You may no longer exercise any of the rights granted to You by this +License as of the date You commence an action, including a cross-claim or +counterclaim, against Licensor or any licensee alleging that the Original Work +infringes a patent. This termination provision shall not apply for an action +alleging patent infringement by combinations of the Original Work with other +software or hardware. + +11) Jurisdiction, Venue and Governing Law. Any action or suit relating to this +License may be brought only in the courts of a jurisdiction wherein the +Licensor resides or in which Licensor conducts its primary business, and under +the laws of that jurisdiction excluding its conflict-of-law provisions. The +application of the United Nations Convention on Contracts for the +International Sale of Goods is expressly excluded. Any use of the Original +Work outside the scope of this License or after its termination shall be +subject to the requirements and penalties of copyright or patent law in the +appropriate jurisdiction. This section shall survive the termination of this +License. + +12) Attorneys' Fees. 
In any action to enforce the terms of this License or +seeking damages relating thereto, the prevailing party shall be entitled to +recover its costs and expenses, including, without limitation, reasonable +attorneys' fees and costs incurred in connection with such action, including +any appeal of such action. This section shall survive the termination of this +License. + +13) Miscellaneous. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent necessary +to make it enforceable. + +14) Definition of "You" in This License. "You" throughout this License, +whether in upper or lower case, means an individual or a legal entity +exercising rights under, and complying with all of the terms of, this License. +For legal entities, "You" includes any entity that controls, is controlled by, +or is under common control with you. For purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the direction or +management of such entity, whether by contract or otherwise, or (ii) ownership +of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial +ownership of such entity. + +15) Right to Use. You may use the Original Work in all ways not otherwise +restricted or conditioned by this License or by law, and Licensor promises not +to interfere with or be responsible for such uses by You. + +16) Modification of This License. This License is Copyright © 2005 Lawrence +Rosen. Permission is granted to copy, distribute, or communicate this License +without modification. Nothing in this License permits You to modify this +License as applied to the Original Work or to Derivative Works. 
However, You +may modify the text of this License and copy, distribute or communicate your +modified version (the "Modified License") and apply it to other original works +of authorship subject to the following conditions: (i) You may not indicate in +any way that your Modified License is the "Open Software License" or "OSL" and +you may not use those names in the name of your Modified License; (ii) You +must replace the notice specified in the first paragraph above with the notice +"Licensed under " or with a notice of your own +that is not confusingly similar to the notice in this License; and (iii) You +may not claim that your original works are open source software unless your +Modified License has been approved by Open Source Initiative (OSI) and You +comply with its license review and certification process. +''', + ), + ( + 'PostgreSQL', + '''\ +PostgreSQL License + +Copyright (c) [year], [fullname] + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement is +hereby granted, provided that the above copyright notice and this paragraph +and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL [fullname] BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING +OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF [fullname] +HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[fullname] SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, +AND [fullname] HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, +ENHANCEMENTS, OR MODIFICATIONS. 
+''', + ), + ( + 'UPL-1.0', + '''\ +Copyright (c) [year] [fullname] + +The Universal Permissive License (UPL), Version 1.0 + +Subject to the condition set forth below, permission is hereby granted to any +person obtaining a copy of this software, associate documentation and/or data +(collectively the "Software"), free of charge and under any and all copyright +rights in the Software, and any and all patent rights owned or freely +licensable by each licensor hereunder covering either (i) the unmodified +Software as contributed to or provided by such licensor, or (ii) the Larger +Works (as defined below), to deal in both + +(a) the Software, and +(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +one is included with the Software (each a “Larger Work” to which the Software +is contributed by such licensors), + +without restriction, including without limitation the rights to copy, create +derivative works of, display, perform, and distribute the Software and make, +use, sell, offer for sale, import, export, have made, and have sold the +Software and the Larger Work(s), and to sublicense the foregoing rights on +either these or other terms. + +This license is subject to the following condition: +The above copyright notice and either this complete permission notice or at +a minimum a reference to the UPL must be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+''', + ), + ( + 'Unlicense', + '''\ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to +''', + ), + ( + 'WTFPL', + '''\ +DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + Version 2, December 2004 + + Copyright (C) 2004 Sam Hocevar + + Everyone is permitted to copy and distribute verbatim or modified + copies of this license document, and changing it is allowed as long + as the name is changed. + + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. You just DO WHAT THE FUCK YOU WANT TO. +''', + ), + ( + 'Zlib', + '''\ +zlib License + +(C) [year] [fullname] + +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any damages +arising from the use of this software. 
+ +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +''', + ), +) +``` + +### URLs + - `Homepage`: https://github.com/pre-commit/identify + + ## idna (3.11) ### Licenses @@ -5591,7 +14729,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Source`: https://github.com/kjd/idna -## importlib-metadata (8.7.0) +## importlib_metadata (8.7.0) ### Licenses License: `Apache Software License` @@ -5806,7 +14944,73 @@ License: `Apache Software License` - `Source`: https://github.com/python/importlib_metadata -## jinja2 (3.1.6) +## iniconfig (2.3.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2010 - 2023 Holger Krekel and others + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/pytest-dev/iniconfig + + +## jieba (0.42.1) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2013 Sun Junyi + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.``` + +### URLs + - `Homepage`: https://github.com/fxsjy/jieba + + +## Jinja2 (3.1.6) ### Licenses License: `BSD License` @@ -5851,12 +15055,12 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- `Source`: https://github.com/pallets/jinja/ -## jiter (0.11.0) +## jiter (0.12.0) ### Licenses -License: `MIT` +License: `MIT License` - - `LICENSE`: + - `licenses/LICENSE`: ``` The MIT License (MIT) @@ -5885,6 +15089,92 @@ SOFTWARE. - `Homepage`: https://github.com/pydantic/jiter/ +## joblib (1.5.2) + +### Licenses +License: `BSD 3-Clause` + + - `licenses/LICENSE.txt`: +``` +BSD 3-Clause License + +Copyright (c) 2008-2021, The joblib developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+``` + +### URLs + - `Homepage`: https://joblib.readthedocs.io + - `Source`: https://github.com/joblib/joblib + + +## jsonlines (4.0.0) + +### Licenses +License: `BSD` + + - `LICENSE.rst`: +``` +*(This is the OSI approved 3-clause "New BSD License".)* + +Copyright © 2016, wouter bolsterlee + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +* Neither the name of the author nor the names of the contributors may be used + to endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+``` + +### URLs + - `Homepage`: https://github.com/wbolster/jsonlines + + ## jsonschema (4.25.1) ### Licenses @@ -5892,7 +15182,7 @@ License: `MIT` - `licenses/COPYING`: ``` -Copyright (c) 2013 Julian Berman +Copyright (c) 2022 Julian Berman Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -5960,6 +15250,41 @@ THE SOFTWARE. - `Tidelift`: https://tidelift.com/subscription/pkg/pypi-jsonschema-specifications?utm_source=pypi-jsonschema-specifications&utm_medium=referral&utm_campaign=pypi-link +## kaleido (1.2.0) + +### Licenses +License: `The MIT License (MIT)` + + - `LICENSE.md`: +``` +The MIT License (MIT) + +Copyright (c) Plotly, Inc + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/plotly/kaleido + - `Repository`: https://github.com/plotly/kaleido + + ## kiwisolver (1.4.9) ### Licenses @@ -6047,7 +15372,7 @@ to indicate the copyright and license terms: - `repository`: https://github.com/nucleic/kiwi -## lark (1.3.0) +## lark (1.3.1) ### Licenses License: `MIT` @@ -6114,271 +15439,74 @@ License: `MIT` - `repository`: https://github.com/microsoft/llguidance -## llvmlite (0.45.1) +## lm_eval (0.4.8) ### Licenses -License: `BSD` +License: `MIT` - - `licenses/LICENSE.thirdparty`: + - `LICENSE.md`: ``` -The llvmlite source tree includes code from LLVM that is governed by the -following license. +MIT License -============================================================================== -The Apache License v2.0 with LLVM Exceptions: -============================================================================== +Copyright (c) 2020 EleutherAI - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ----- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
- -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. -``` - - - `licenses/LICENSE`: -``` -Copyright (c) 2014-, Continuum Analytics, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. ``` ### URLs - - `Homepage`: http://llvmlite.readthedocs.io - - `Source`: https://github.com/numba/llvmlite + - `Homepage`: https://github.com/EleutherAI/lm-evaluation-harness + - `Repository`: https://github.com/EleutherAI/lm-evaluation-harness + + +## logistro (2.0.1) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2025 GeoPozo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/geopozo/logistro + - `Repository`: https://github.com/geopozo/logistro ## lxml (6.0.2) @@ -6386,6 +15514,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ### Licenses License: `BSD-3-Clause` + - `licenses/LICENSES.txt`: +``` +lxml is copyright Infrae and distributed under the BSD license (see +doc/licenses/BSD.txt), with the following exceptions: + +Some code, such a selftest.py, selftest2.py and +src/lxml/_elementpath.py are derived from ElementTree and +cElementTree. See doc/licenses/elementtree.txt for the license text. + +lxml.cssselect and lxml.html are copyright Ian Bicking and distributed +under the BSD license (see doc/licenses/BSD.txt). + +test.py, the test-runner script, is GPL and copyright Shuttleworth +Foundation. See doc/licenses/GPL.txt. It is believed the unchanged +inclusion of test.py to run the unit test suite falls under the +"aggregation" clause of the GPL and thus does not affect the license +of the rest of the package. 
+ +The isoschematron implementation uses several XSL and RelaxNG resources: + * The (XML syntax) RelaxNG schema for schematron, copyright International + Organization for Standardization (see + src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license + text) + * The skeleton iso-schematron-xlt1 pure-xslt schematron implementation + xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing + Center, Taiwan (see the xsl files here for the license text: + src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/) + * The xsd/rng schema schematron extraction xsl transformations are unlicensed + and copyright the respective authors as noted (see + src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and + src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl) +``` + - `licenses/LICENSE.txt`: ``` BSD 3-Clause License @@ -6421,50 +15582,76 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` - - `licenses/LICENSES.txt`: -``` -lxml is copyright Infrae and distributed under the BSD license (see -doc/licenses/BSD.txt), with the following exceptions: - -Some code, such a selftest.py, selftest2.py and -src/lxml/_elementpath.py are derived from ElementTree and -cElementTree. See doc/licenses/elementtree.txt for the license text. - -lxml.cssselect and lxml.html are copyright Ian Bicking and distributed -under the BSD license (see doc/licenses/BSD.txt). - -test.py, the test-runner script, is GPL and copyright Shuttleworth -Foundation. See doc/licenses/GPL.txt. It is believed the unchanged -inclusion of test.py to run the unit test suite falls under the -"aggregation" clause of the GPL and thus does not affect the license -of the rest of the package. 
- -The isoschematron implementation uses several XSL and RelaxNG resources: - * The (XML syntax) RelaxNG schema for schematron, copyright International - Organization for Standardization (see - src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license - text) - * The skeleton iso-schematron-xlt1 pure-xslt schematron implementation - xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing - Center, Taiwan (see the xsl files here for the license text: - src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/) - * The xsd/rng schema schematron extraction xsl transformations are unlicensed - and copyright the respective authors as noted (see - src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and - src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl) -``` - ### URLs - `Bug Tracker`: https://bugs.launchpad.net/lxml - `Homepage`: https://lxml.de/ - `Source`: https://github.com/lxml/lxml +## Mako (1.3.10) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Copyright 2006-2025 the Mako authors and contributors . + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Documentation`: https://docs.makotemplates.org + - `Homepage`: https://www.makotemplates.org/ + - `Issue Tracker`: https://github.com/sqlalchemy/mako + + ## markdown-it-py (4.0.0) ### Licenses License: `MIT License` + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2020 ExecutableBookProject + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + - `licenses/LICENSE.markdown-it`: ``` Copyright (c) 2014 Vitaly Puzrin, Alex Kocharin. @@ -6489,31 +15676,6 @@ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-``` - - - `licenses/LICENSE`: -``` -MIT License - -Copyright (c) 2020 ExecutableBookProject - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. ``` ### URLs @@ -6521,7 +15683,7 @@ SOFTWARE. - `Homepage`: https://github.com/executablebooks/markdown-it-py -## markupsafe (3.0.3) +## MarkupSafe (3.0.3) ### Licenses License: `BSD-3-Clause` @@ -6571,107 +15733,236 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ### Licenses License: `Python Software Foundation License` - - `LICENSE`: + - `mpl-data/fonts/ttf/LICENSE_DEJAVU`: ``` -License agreement for matplotlib versions 1.3.0 and later -========================================================= +Fonts are (c) Bitstream (see below). DejaVu changes are in public domain. +Glyphs imported from Arev fonts are (c) Tavmjong Bah (see below) -1. 
This LICENSE AGREEMENT is between the Matplotlib Development Team -("MDT"), and the Individual or Organization ("Licensee") accessing and -otherwise using matplotlib software in source or binary form and its -associated documentation. +Bitstream Vera Fonts Copyright +------------------------------ -2. Subject to the terms and conditions of this License Agreement, MDT -hereby grants Licensee a nonexclusive, royalty-free, world-wide license -to reproduce, analyze, test, perform and/or display publicly, prepare -derivative works, distribute, and otherwise use matplotlib -alone or in any derivative version, provided, however, that MDT's -License Agreement and MDT's notice of copyright, i.e., "Copyright (c) -2012- Matplotlib Development Team; All Rights Reserved" are retained in -matplotlib alone or in any derivative version prepared by -Licensee. +Copyright (c) 2003 by Bitstream, Inc. All Rights Reserved. Bitstream Vera is +a trademark of Bitstream, Inc. -3. In the event Licensee prepares a derivative work that is based on or -incorporates matplotlib or any part thereof, and wants to -make the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to matplotlib . +Permission is hereby granted, free of charge, to any person obtaining a copy +of the fonts accompanying this license ("Fonts") and associated +documentation files (the "Font Software"), to reproduce and distribute the +Font Software, including without limitation the rights to use, copy, merge, +publish, distribute, and/or sell copies of the Font Software, and to permit +persons to whom the Font Software is furnished to do so, subject to the +following conditions: -4. MDT is making matplotlib available to Licensee on an "AS -IS" basis. MDT MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, MDT MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB -WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. +The above copyright and trademark notices and this permission notice shall +be included in all copies of one or more of the Font Software typefaces. -5. MDT SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB - FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR -LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING -MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF -THE POSSIBILITY THEREOF. +The Font Software may be modified, altered, or added to, and in particular +the designs of glyphs or characters in the Fonts may be modified and +additional glyphs or characters may be added to the Fonts, only if the fonts +are renamed to names not containing either the words "Bitstream" or the word +"Vera". -6. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. +This License becomes null and void to the extent applicable to Fonts or Font +Software that has been modified and is distributed under the "Bitstream +Vera" names. -7. Nothing in this License Agreement shall be deemed to create any -relationship of agency, partnership, or joint venture between MDT and -Licensee. This License Agreement does not grant permission to use MDT -trademarks or trade name in a trademark sense to endorse or promote -products or services of Licensee, or any third party. +The Font Software may be sold as part of a larger software package but no +copy of one or more of the Font Software typefaces may be sold by itself. -8. By copying, installing or otherwise using matplotlib , -Licensee agrees to be bound by the terms and conditions of this License -Agreement. 
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, +TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL BITSTREAM OR THE GNOME +FOUNDATION BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING +ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF +THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE +FONT SOFTWARE. -License agreement for matplotlib versions prior to 1.3.0 -======================================================== +Except as contained in this notice, the names of Gnome, the Gnome +Foundation, and Bitstream Inc., shall not be used in advertising or +otherwise to promote the sale, use or other dealings in this Font Software +without prior written authorization from the Gnome Foundation or Bitstream +Inc., respectively. For further information, contact: fonts at gnome dot +org. -1. This LICENSE AGREEMENT is between John D. Hunter ("JDH"), and the -Individual or Organization ("Licensee") accessing and otherwise using -matplotlib software in source or binary form and its associated -documentation. +Arev Fonts Copyright +------------------------------ -2. Subject to the terms and conditions of this License Agreement, JDH -hereby grants Licensee a nonexclusive, royalty-free, world-wide license -to reproduce, analyze, test, perform and/or display publicly, prepare -derivative works, distribute, and otherwise use matplotlib -alone or in any derivative version, provided, however, that JDH's -License Agreement and JDH's notice of copyright, i.e., "Copyright (c) -2002-2011 John D. Hunter; All Rights Reserved" are retained in -matplotlib alone or in any derivative version prepared by -Licensee. +Copyright (c) 2006 by Tavmjong Bah. All Rights Reserved. -3. 
In the event Licensee prepares a derivative work that is based on or -incorporates matplotlib or any part thereof, and wants to -make the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to matplotlib. +Permission is hereby granted, free of charge, to any person obtaining +a copy of the fonts accompanying this license ("Fonts") and +associated documentation files (the "Font Software"), to reproduce +and distribute the modifications to the Bitstream Vera Font Software, +including without limitation the rights to use, copy, merge, publish, +distribute, and/or sell copies of the Font Software, and to permit +persons to whom the Font Software is furnished to do so, subject to +the following conditions: -4. JDH is making matplotlib available to Licensee on an "AS -IS" basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB -WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. +The above copyright and trademark notices and this permission notice +shall be included in all copies of one or more of the Font Software +typefaces. -5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB - FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR -LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING -MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF -THE POSSIBILITY THEREOF. +The Font Software may be modified, altered, or added to, and in +particular the designs of glyphs or characters in the Fonts may be +modified and additional glyphs or characters may be added to the +Fonts, only if the fonts are renamed to names not containing either +the words "Tavmjong Bah" or the word "Arev". -6. 
This License Agreement will automatically terminate upon a material -breach of its terms and conditions. +This License becomes null and void to the extent applicable to Fonts +or Font Software that has been modified and is distributed under the +"Tavmjong Bah Arev" names. -7. Nothing in this License Agreement shall be deemed to create any -relationship of agency, partnership, or joint venture between JDH and -Licensee. This License Agreement does not grant permission to use JDH -trademarks or trade name in a trademark sense to endorse or promote -products or services of Licensee, or any third party. +The Font Software may be sold as part of a larger software package but +no copy of one or more of the Font Software typefaces may be sold by +itself. -8. By copying, installing or otherwise using matplotlib, -Licensee agrees to be bound by the terms and conditions of this License -Agreement.``` +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL +TAVMJONG BAH BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. + +Except as contained in this notice, the name of Tavmjong Bah shall not +be used in advertising or otherwise to promote the sale, use or other +dealings in this Font Software without prior written authorization +from Tavmjong Bah. For further information, contact: tavmjong @ free +. fr. + +$Id: LICENSE 2133 2007-11-28 02:46:28Z lechimp $ +``` + + - `mpl-data/fonts/ttf/LICENSE_STIX`: +``` +The STIX fonts distributed with matplotlib have been modified from +their canonical form. 
They have been converted from OTF to TTF format +using Fontforge and this script: + + #!/usr/bin/env fontforge + i=1 + while ( i<$argc ) + Open($argv[i]) + Generate($argv[i]:r + ".ttf") + i = i+1 + endloop + +The original STIX Font License begins below. + +----------------------------------------------------------- + +STIX Font License + +24 May 2010 + +Copyright (c) 2001-2010 by the STI Pub Companies, consisting of the American +Institute of Physics, the American Chemical Society, the American Mathematical +Society, the American Physical Society, Elsevier, Inc., and The Institute of +Electrical and Electronic Engineers, Inc. (www.stixfonts.org), with Reserved +Font Name STIX Fonts, STIX Fonts (TM) is a trademark of The Institute of +Electrical and Electronics Engineers, Inc. + +Portions copyright (c) 1998-2003 by MicroPress, Inc. (www.micropress-inc.com), +with Reserved Font Name TM Math. To obtain additional mathematical fonts, please +contact MicroPress, Inc., 68-30 Harrow Street, Forest Hills, NY 11375, USA, +Phone: (718) 575-1816. + +Portions copyright (c) 1990 by Elsevier, Inc. + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://scripts.sil.org/OFL + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. 
The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. 
These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. 
+``` ### URLs - `Bug Tracker`: https://github.com/matplotlib/matplotlib/issues @@ -6683,6 +15974,43 @@ Agreement.``` - `Source Code`: https://github.com/matplotlib/matplotlib +## mbstrdecoder (1.1.4) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2016 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Changelog`: https://github.com/thombashi/mbstrdecoder/releases + - `Homepage`: https://github.com/thombashi/mbstrdecoder + - `Source`: https://github.com/thombashi/mbstrdecoder + - `Tracker`: https://github.com/thombashi/mbstrdecoder/issues + + ## mdurl (0.1.2) ### Licenses @@ -6958,7 +16286,7 @@ License: `Apache License, Version 2.0` - `Source`: https://github.com/mesonbuild/meson -## ml-dtypes (0.5.3) +## ml_dtypes (0.5.4) ### Licenses License: `Apache-2.0` @@ -7551,6 +16879,39 @@ Exhibit B - "Incompatible With Secondary Licenses" Notice - `repository`: https://github.com/jax-ml/ml_dtypes +## more-itertools (10.8.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Copyright (c) 2012 Erik Rose + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Documentation`: https://more-itertools.readthedocs.io/en/stable/ + - `Homepage`: https://github.com/more-itertools/more-itertools + + ## mpi4py (4.1.1) ### Licenses @@ -7678,38 +17039,6 @@ License: `Apache License 2.0` ### Licenses License: `BSD-3-Clause` - - `COPYING`: -``` -Copyright (c) 2006-2008, R Oudkerk - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. -3. Neither the name of author nor the names of any contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. -``` - - `LICENSE`: ``` Copyright (c) 2008-2016 California Institute of Technology. 
@@ -7750,6 +17079,38 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `COPYING`: +``` +Copyright (c) 2006-2008, R Oudkerk + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the name of author nor the names of any contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. ``` ### URLs @@ -7760,7 +17121,91 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- `Source Code`: https://github.com/uqfoundation/multiprocess -## narwhals (2.8.0) +## mypy (1.18.2) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Mypy extensions are licensed under the terms of the MIT license, reproduced below. + += = = = = + +The MIT License + +Copyright (c) 2016-2017 Jukka Lehtosalo and contributors + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + += = = = = +``` + +### URLs + - `Changelog`: https://github.com/python/mypy/blob/master/CHANGELOG.md + - `Documentation`: https://mypy.readthedocs.io/en/stable/index.html + - `Homepage`: https://www.mypy-lang.org/ + - `Issues`: https://github.com/python/mypy/issues + - `Repository`: https://github.com/python/mypy + + +## mypy_extensions (1.1.0) + +### Licenses +License: `None` + + - `licenses/LICENSE`: +``` +Mypy extensions are licensed under the terms of the MIT license, reproduced below. 
+ += = = = = + +The MIT License + +Copyright (c) 2016-2017 Jukka Lehtosalo and contributors + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + += = = = = +``` + +### URLs + - `Homepage`: https://github.com/python/mypy_extensions + + +## narwhals (2.12.0) ### Licenses License: `MIT License` @@ -7797,10 +17242,10 @@ SOFTWARE. - `Repository`: https://github.com/narwhals-dev/narwhals -## networkx (3.5) +## networkx (3.6) ### Licenses -License: `BSD License` +License: `BSD-3-Clause` - `licenses/LICENSE.txt`: ``` @@ -8059,142 +17504,267 @@ third-party archives. - `Source Code`: https://github.com/scikit-build/ninja-python-distributions -## numba (0.62.1) +## nltk (3.9.2) + +### Licenses +License: `Apache License, Version 2.0` + + - `licenses/LICENSE.txt`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Documentation`: https://www.nltk.org/ + - `Homepage`: https://www.nltk.org/ + - `Issue Tracker`: https://github.com/nltk/nltk/issues + - `Source Code`: https://github.com/nltk/nltk + + +## nodeenv (1.9.1) ### Licenses License: `BSD` - - `licenses/LICENSE`: + - `LICENSE`: ``` -Copyright (c) 2012, Anaconda, Inc. -Copyright (c) 2024, NVIDIA CORPORATION. -All rights reserved. +Copyright (c) 2011, Eugene Kalinin. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: +Some rights reserved. -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. 
+Redistribution and use in source and binary forms of the software as well +as documentation, with or without modification, are permitted provided +that the following conditions are met: -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -``` +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. - - `licenses/LICENSE.numba`: -``` -Copyright (c) 2012, Anaconda, Inc. -All rights reserved. +* The names of the contributors may not be used to endorse or + promote products derived from this software without specific + prior written permission. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. ``` ### URLs - - `Homepage`: https://numba.pydata.org - - -## numba-cuda (0.20.0) - -### Licenses -License: `BSD-2-Clause` - - - `licenses/LICENSE`: -``` -Copyright (c) 2012, Anaconda, Inc. -Copyright (c) 2024, NVIDIA CORPORATION. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -``` - - - `licenses/LICENSE.numba`: -``` -Copyright (c) 2012, Anaconda, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-``` - -### URLs - - `Documentation`: https://nvidia.github.io/numba-cuda/ - - `Homepage`: https://nvidia.github.io/numba-cuda/ - - `Issues`: https://github.com/NVIDIA/numba-cuda/issues - - `License`: https://github.com/NVIDIA/numba-cuda/blob/main/LICENSE - - `Repository`: https://github.com/NVIDIA/numba-cuda + - `Homepage`: https://github.com/ekalinin/nodeenv ## numexpr (2.13.1) @@ -9222,3174 +18792,12 @@ License: LGPL-2.1-or-later - `Tracker`: https://github.com/numpy/numpy/issues -## nvidia-cublas-cu12 (12.8.4.1) - -### Licenses -License: `NVIDIA Proprietary Software` - - - `License.txt`: -``` -End User License Agreement --------------------------- - - -Preface -------- - -The Software License Agreement in Chapter 1 and the Supplement -in Chapter 2 contain license terms and conditions that govern -the use of NVIDIA software. By accepting this agreement, you -agree to comply with all the terms and conditions applicable -to the product(s) included herein. - - -NVIDIA Driver - - -Description - -This package contains the operating system driver and -fundamental system software components for NVIDIA GPUs. - - -NVIDIA CUDA Toolkit - - -Description - -The NVIDIA CUDA Toolkit provides command-line and graphical -tools for building, debugging and optimizing the performance -of applications accelerated by NVIDIA GPUs, runtime and math -libraries, and documentation including programming guides, -user manuals, and API references. - - -Default Install Location of CUDA Toolkit - -Windows platform: - -%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.# - -Linux platform: - -/usr/local/cuda-#.# - -Mac platform: - -/Developer/NVIDIA/CUDA-#.# - - -NVIDIA CUDA Samples - - -Description - -This package includes over 100+ CUDA examples that demonstrate -various CUDA programming principles, and efficient CUDA -implementation of algorithms in specific application domains. 
- - -Default Install Location of CUDA Samples - -Windows platform: - -%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.# - -Linux platform: - -/usr/local/cuda-#.#/samples - -and - -$HOME/NVIDIA_CUDA-#.#_Samples - -Mac platform: - -/Developer/NVIDIA/CUDA-#.#/samples - - -NVIDIA Nsight Visual Studio Edition (Windows only) - - -Description - -NVIDIA Nsight Development Platform, Visual Studio Edition is a -development environment integrated into Microsoft Visual -Studio that provides tools for debugging, profiling, analyzing -and optimizing your GPU computing and graphics applications. - - -Default Install Location of Nsight Visual Studio Edition - -Windows platform: - -%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.# - - -1. License Agreement for NVIDIA Software Development Kits ---------------------------------------------------------- - - -Release Date: July 26, 2018 ---------------------------- - - -Important NoticeRead before downloading, installing, -copying or using the licensed software: -------------------------------------------------------- - -This license agreement, including exhibits attached -("Agreement”) is a legal agreement between you and NVIDIA -Corporation ("NVIDIA") and governs your use of a NVIDIA -software development kit (“SDK”). - -Each SDK has its own set of software and materials, but here -is a description of the types of items that may be included in -a SDK: source code, header files, APIs, data sets and assets -(examples include images, textures, models, scenes, videos, -native API input/output files), binary software, sample code, -libraries, utility programs, programming code and -documentation. - -This Agreement can be accepted only by an adult of legal age -of majority in the country in which the SDK is used. 
- -If you are entering into this Agreement on behalf of a company -or other legal entity, you represent that you have the legal -authority to bind the entity to this Agreement, in which case -“you” will mean the entity you represent. - -If you don’t have the required age or authority to accept -this Agreement, or if you don’t accept all the terms and -conditions of this Agreement, do not download, install or use -the SDK. - -You agree to use the SDK only for purposes that are permitted -by (a) this Agreement, and (b) any applicable law, regulation -or generally accepted practices or guidelines in the relevant -jurisdictions. - - -1.1. License - - -1.1.1. License Grant - -Subject to the terms of this Agreement, NVIDIA hereby grants -you a non-exclusive, non-transferable license, without the -right to sublicense (except as expressly provided in this -Agreement) to: - - 1. Install and use the SDK, - - 2. Modify and create derivative works of sample source code - delivered in the SDK, and - - 3. Distribute those portions of the SDK that are identified - in this Agreement as distributable, as incorporated in - object code format into a software application that meets - the distribution requirements indicated in this Agreement. - - -1.1.2. Distribution Requirements - -These are the distribution requirements for you to exercise -the distribution grant: - - 1. Your application must have material additional - functionality, beyond the included portions of the SDK. - - 2. The distributable portions of the SDK shall only be - accessed by your application. - - 3. The following notice shall be included in modifications - and derivative works of sample source code distributed: - “This software contains source code provided by NVIDIA - Corporation.” - - 4. Unless a developer tool is identified in this Agreement - as distributable, it is delivered for your internal use - only. - - 5. 
The terms under which you distribute your application - must be consistent with the terms of this Agreement, - including (without limitation) terms relating to the - license grant and license restrictions and protection of - NVIDIA’s intellectual property rights. Additionally, you - agree that you will protect the privacy, security and - legal rights of your application users. - - 6. You agree to notify NVIDIA in writing of any known or - suspected distribution or use of the SDK not in compliance - with the requirements of this Agreement, and to enforce - the terms of your agreements with respect to distributed - SDK. - - -1.1.3. Authorized Users - -You may allow employees and contractors of your entity or of -your subsidiary(ies) to access and use the SDK from your -secure network to perform work on your behalf. - -If you are an academic institution you may allow users -enrolled or employed by the academic institution to access and -use the SDK from your secure network. - -You are responsible for the compliance with the terms of this -Agreement by your authorized users. If you become aware that -your authorized users didn’t follow the terms of this -Agreement, you agree to take reasonable steps to resolve the -non-compliance and prevent new occurrences. - - -1.1.4. Pre-Release SDK - -The SDK versions identified as alpha, beta, preview or -otherwise as pre-release, may not be fully functional, may -contain errors or design flaws, and may have reduced or -different security, privacy, accessibility, availability, and -reliability standards relative to commercial versions of -NVIDIA software and materials. Use of a pre-release SDK may -result in unexpected results, loss of data, project delays or -other unpredictable damage or loss. - -You may use a pre-release SDK at your own risk, understanding -that pre-release SDKs are not intended for use in production -or business-critical systems. 
- -NVIDIA may choose not to make available a commercial version -of any pre-release SDK. NVIDIA may also choose to abandon -development and terminate the availability of a pre-release -SDK at any time without liability. - - -1.1.5. Updates - -NVIDIA may, at its option, make available patches, workarounds -or other updates to this SDK. Unless the updates are provided -with their separate governing terms, they are deemed part of -the SDK licensed to you as provided in this Agreement. You -agree that the form and content of the SDK that NVIDIA -provides may change without prior notice to you. While NVIDIA -generally maintains compatibility between versions, NVIDIA may -in some cases make changes that introduce incompatibilities in -future versions of the SDK. - - -1.1.6. Third Party Licenses - -The SDK may come bundled with, or otherwise include or be -distributed with, third party software licensed by a NVIDIA -supplier and/or open source software provided under an open -source license. Use of third party software is subject to the -third-party license terms, or in the absence of third party -terms, the terms of this Agreement. Copyright to third party -software is held by the copyright holders indicated in the -third-party software or license. - - -1.1.7. Reservation of Rights - -NVIDIA reserves all rights, title, and interest in and to the -SDK, not expressly granted to you under this Agreement. - - -1.2. Limitations - -The following license limitations apply to your use of the -SDK: - - 1. You may not reverse engineer, decompile or disassemble, - or remove copyright or other proprietary notices from any - portion of the SDK or copies of the SDK. - - 2. Except as expressly provided in this Agreement, you may - not copy, sell, rent, sublicense, transfer, distribute, - modify, or create derivative works of any portion of the - SDK. For clarity, you may not distribute or sublicense the - SDK as a stand-alone product. - - 3. 
Unless you have an agreement with NVIDIA for this - purpose, you may not indicate that an application created - with the SDK is sponsored or endorsed by NVIDIA. - - 4. You may not bypass, disable, or circumvent any - encryption, security, digital rights management or - authentication mechanism in the SDK. - - 5. You may not use the SDK in any manner that would cause it - to become subject to an open source software license. As - examples, licenses that require as a condition of use, - modification, and/or distribution that the SDK be: - - a. Disclosed or distributed in source code form; - - b. Licensed for the purpose of making derivative works; - or - - c. Redistributable at no charge. - - 6. Unless you have an agreement with NVIDIA for this - purpose, you may not use the SDK with any system or - application where the use or failure of the system or - application can reasonably be expected to threaten or - result in personal injury, death, or catastrophic loss. - Examples include use in avionics, navigation, military, - medical, life support or other life critical applications. - NVIDIA does not design, test or manufacture the SDK for - these critical uses and NVIDIA shall not be liable to you - or any third party, in whole or in part, for any claims or - damages arising from such uses. - - 7. You agree to defend, indemnify and hold harmless NVIDIA - and its affiliates, and their respective employees, - contractors, agents, officers and directors, from and - against any and all claims, damages, obligations, losses, - liabilities, costs or debt, fines, restitutions and - expenses (including but not limited to attorney’s fees - and costs incident to establishing the right of - indemnification) arising out of or related to your use of - the SDK outside of the scope of this Agreement, or not in - compliance with its terms. - - -1.3. Ownership - - 1. 
NVIDIA or its licensors hold all rights, title and - interest in and to the SDK and its modifications and - derivative works, including their respective intellectual - property rights, subject to your rights described in this - section. This SDK may include software and materials from - NVIDIA’s licensors, and these licensors are intended - third party beneficiaries that may enforce this Agreement - with respect to their intellectual property rights. - - 2. You hold all rights, title and interest in and to your - applications and your derivative works of the sample - source code delivered in the SDK, including their - respective intellectual property rights, subject to - NVIDIA’s rights described in this section. - - 3. You may, but don’t have to, provide to NVIDIA - suggestions, feature requests or other feedback regarding - the SDK, including possible enhancements or modifications - to the SDK. For any feedback that you voluntarily provide, - you hereby grant NVIDIA and its affiliates a perpetual, - non-exclusive, worldwide, irrevocable license to use, - reproduce, modify, license, sublicense (through multiple - tiers of sublicensees), and distribute (through multiple - tiers of distributors) it without the payment of any - royalties or fees to you. NVIDIA will use feedback at its - choice. NVIDIA is constantly looking for ways to improve - its products, so you may send feedback to NVIDIA through - the developer portal at https://developer.nvidia.com. - - -1.4. No Warranties - -THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL -FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND -ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND -OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, -BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE -ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. 
NO -WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF -DEALING OR COURSE OF TRADE. - - -1.5. Limitation of Liability - -TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS -AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, -PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS -OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF -PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION -WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, -WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH -OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), -PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF -LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES -TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS -AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE -NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS -LIMIT. - -These exclusions and limitations of liability shall apply -regardless if NVIDIA or its affiliates have been advised of -the possibility of such damages, and regardless of whether a -remedy fails its essential purpose. These exclusions and -limitations of liability form an essential basis of the -bargain between the parties, and, absent any of these -exclusions or limitations of liability, the provisions of this -Agreement, including, without limitation, the economic terms, -would be substantially different. - - -1.6. Termination - - 1. This Agreement will continue to apply until terminated by - either you or NVIDIA as described below. - - 2. If you want to terminate this Agreement, you may do so by - stopping to use the SDK. - - 3. NVIDIA may, at any time, terminate this Agreement if: - - a. (i) you fail to comply with any term of this - Agreement and the non-compliance is not fixed within - thirty (30) days following notice from NVIDIA (or - immediately if you violate NVIDIA’s intellectual - property rights); - - b. 
(ii) you commence or participate in any legal - proceeding against NVIDIA with respect to the SDK; or - - c. (iii) NVIDIA decides to no longer provide the SDK in - a country or, in NVIDIA’s sole discretion, the - continued use of it is no longer commercially viable. - - 4. Upon any termination of this Agreement, you agree to - promptly discontinue use of the SDK and destroy all copies - in your possession or control. Your prior distributions in - accordance with this Agreement are not affected by the - termination of this Agreement. Upon written request, you - will certify in writing that you have complied with your - commitments under this section. Upon any termination of - this Agreement all provisions survive except for the - license grant provisions. - - -1.7. General - -If you wish to assign this Agreement or your rights and -obligations, including by merger, consolidation, dissolution -or operation of law, contact NVIDIA to ask for permission. Any -attempted assignment not approved by NVIDIA in writing shall -be void and of no effect. NVIDIA may assign, delegate or -transfer this Agreement and its rights and obligations, and if -to a non-affiliate you will be notified. - -You agree to cooperate with NVIDIA and provide reasonably -requested information to verify your compliance with this -Agreement. - -This Agreement will be governed in all respects by the laws of -the United States and of the State of Delaware as those laws -are applied to contracts entered into and performed entirely -within Delaware by Delaware residents, without regard to the -conflicts of laws principles. The United Nations Convention on -Contracts for the International Sale of Goods is specifically -disclaimed. You agree to all terms of this Agreement in the -English language. - -The state or federal courts residing in Santa Clara County, -California shall have exclusive jurisdiction over any dispute -or claim arising out of this Agreement. 
Notwithstanding this, -you agree that NVIDIA shall still be allowed to apply for -injunctive remedies or an equivalent type of urgent legal -relief in any jurisdiction. - -If any court of competent jurisdiction determines that any -provision of this Agreement is illegal, invalid or -unenforceable, such provision will be construed as limited to -the extent necessary to be consistent with and fully -enforceable under the law and the remaining provisions will -remain in full force and effect. Unless otherwise specified, -remedies are cumulative. - -Each party acknowledges and agrees that the other is an -independent contractor in the performance of this Agreement. - -The SDK has been developed entirely at private expense and is -“commercial items” consisting of “commercial computer -software” and “commercial computer software -documentation” provided with RESTRICTED RIGHTS. Use, -duplication or disclosure by the U.S. Government or a U.S. -Government subcontractor is subject to the restrictions in -this Agreement pursuant to DFARS 227.7202-3(a) or as set forth -in subparagraphs (c)(1) and (2) of the Commercial Computer -Software - Restricted Rights clause at FAR 52.227-19, as -applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas -Expressway, Santa Clara, CA 95051. - -The SDK is subject to United States export laws and -regulations. You agree that you will not ship, transfer or -export the SDK into any country, or use the SDK in any manner, -prohibited by the United States Bureau of Industry and -Security or economic sanctions regulations administered by the -U.S. Department of Treasury’s Office of Foreign Assets -Control (OFAC), or any applicable export laws, restrictions or -regulations. These laws include restrictions on destinations, -end users and end use. By accepting this Agreement, you -confirm that you are not a resident or citizen of any country -currently embargoed by the U.S. and that you are not otherwise -prohibited from receiving the SDK. 
- -Any notice delivered by NVIDIA to you under this Agreement -will be delivered via mail, email or fax. You agree that any -notices that NVIDIA sends you electronically will satisfy any -legal communication requirements. Please direct your legal -notices or other correspondence to NVIDIA Corporation, 2788 -San Tomas Expressway, Santa Clara, California 95051, United -States of America, Attention: Legal Department. - -This Agreement and any exhibits incorporated into this -Agreement constitute the entire agreement of the parties with -respect to the subject matter of this Agreement and supersede -all prior negotiations or documentation exchanged between the -parties relating to this SDK license. Any additional and/or -conflicting terms on documents issued by you are null, void, -and invalid. Any amendment or waiver under this Agreement -shall be in writing and signed by representatives of both -parties. - - -2. CUDA Toolkit Supplement to Software License Agreement for -NVIDIA Software Development Kits ------------------------------------------------------------- - - -Release date: August 16, 2018 ------------------------------ - -The terms in this supplement govern your use of the NVIDIA -CUDA Toolkit SDK under the terms of your license agreement -(“Agreement”) as modified by this supplement. Capitalized -terms used but not defined below have the meaning assigned to -them in the Agreement. - -This supplement is an exhibit to the Agreement and is -incorporated as an integral part of the Agreement. In the -event of conflict between the terms in this supplement and the -terms in the Agreement, the terms in this supplement govern. - - -2.1. License Scope - -The SDK is licensed for you to develop applications only for -use in systems with NVIDIA GPUs. - - -2.2. Distribution - -The portions of the SDK that are distributable under the -Agreement are listed in Attachment A. - - -2.3. 
Operating Systems - -Those portions of the SDK designed exclusively for use on the -Linux or FreeBSD operating systems, or other operating systems -derived from the source code to these operating systems, may -be copied and redistributed for use in accordance with this -Agreement, provided that the object code files are not -modified in any way (except for unzipping of compressed -files). - - -2.4. Audio and Video Encoders and Decoders - -You acknowledge and agree that it is your sole responsibility -to obtain any additional third-party licenses required to -make, have made, use, have used, sell, import, and offer for -sale your products or services that include or incorporate any -third-party software and content relating to audio and/or -video encoders and decoders from, including but not limited -to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., -MPEG-LA, and Coding Technologies. NVIDIA does not grant to you -under this Agreement any necessary patent or other rights with -respect to any audio and/or video encoders and decoders. - - -2.5. Licensing - -If the distribution terms in this Agreement are not suitable -for your organization, or for any questions regarding this -Agreement, please contact NVIDIA at -nvidia-compute-license-questions@nvidia.com. - - -2.6. 
Attachment A - -The following portions of the SDK are distributable under the -Agreement: - -Component - -CUDA Runtime - -Windows - -cudart.dll, cudart_static.lib, cudadevrt.lib - -Mac OSX - -libcudart.dylib, libcudart_static.a, libcudadevrt.a - -Linux - -libcudart.so, libcudart_static.a, libcudadevrt.a - -Android - -libcudart.so, libcudart_static.a, libcudadevrt.a - -Component - -CUDA FFT Library - -Windows - -cufft.dll, cufftw.dll, cufft.lib, cufftw.lib - -Mac OSX - -libcufft.dylib, libcufft_static.a, libcufftw.dylib, -libcufftw_static.a - -Linux - -libcufft.so, libcufft_static.a, libcufftw.so, -libcufftw_static.a - -Android - -libcufft.so, libcufft_static.a, libcufftw.so, -libcufftw_static.a - -Component - -CUDA BLAS Library - -Windows - -cublas.dll, cublasLt.dll - -Mac OSX - -libcublas.dylib, libcublasLt.dylib, libcublas_static.a, -libcublasLt_static.a - -Linux - -libcublas.so, libcublasLt.so, libcublas_static.a, -libcublasLt_static.a - -Android - -libcublas.so, libcublasLt.so, libcublas_static.a, -libcublasLt_static.a - -Component - -NVIDIA "Drop-in" BLAS Library - -Windows - -nvblas.dll - -Mac OSX - -libnvblas.dylib - -Linux - -libnvblas.so - -Component - -CUDA Sparse Matrix Library - -Windows - -cusparse.dll, cusparse.lib - -Mac OSX - -libcusparse.dylib, libcusparse_static.a - -Linux - -libcusparse.so, libcusparse_static.a - -Android - -libcusparse.so, libcusparse_static.a - -Component - -CUDA Linear Solver Library - -Windows - -cusolver.dll, cusolver.lib - -Mac OSX - -libcusolver.dylib, libcusolver_static.a - -Linux - -libcusolver.so, libcusolver_static.a - -Android - -libcusolver.so, libcusolver_static.a - -Component - -CUDA Random Number Generation Library - -Windows - -curand.dll, curand.lib - -Mac OSX - -libcurand.dylib, libcurand_static.a - -Linux - -libcurand.so, libcurand_static.a - -Android - -libcurand.so, libcurand_static.a - -Component - -CUDA Accelerated Graph Library - -Component - -NVIDIA Performance Primitives Library - -Windows - -nppc.dll, 
nppc.lib, nppial.dll, nppial.lib, nppicc.dll, -nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll, -nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib, -nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll, -nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib - -Mac OSX - -libnppc.dylib, libnppc_static.a, libnppial.dylib, -libnppial_static.a, libnppicc.dylib, libnppicc_static.a, -libnppicom.dylib, libnppicom_static.a, libnppidei.dylib, -libnppidei_static.a, libnppif.dylib, libnppif_static.a, -libnppig.dylib, libnppig_static.a, libnppim.dylib, -libnppisu_static.a, libnppitc.dylib, libnppitc_static.a, -libnpps.dylib, libnpps_static.a - -Linux - -libnppc.so, libnppc_static.a, libnppial.so, -libnppial_static.a, libnppicc.so, libnppicc_static.a, -libnppicom.so, libnppicom_static.a, libnppidei.so, -libnppidei_static.a, libnppif.so, libnppif_static.a -libnppig.so, libnppig_static.a, libnppim.so, -libnppim_static.a, libnppist.so, libnppist_static.a, -libnppisu.so, libnppisu_static.a, libnppitc.so -libnppitc_static.a, libnpps.so, libnpps_static.a - -Android - -libnppc.so, libnppc_static.a, libnppial.so, -libnppial_static.a, libnppicc.so, libnppicc_static.a, -libnppicom.so, libnppicom_static.a, libnppidei.so, -libnppidei_static.a, libnppif.so, libnppif_static.a -libnppig.so, libnppig_static.a, libnppim.so, -libnppim_static.a, libnppist.so, libnppist_static.a, -libnppisu.so, libnppisu_static.a, libnppitc.so -libnppitc_static.a, libnpps.so, libnpps_static.a - -Component - -NVIDIA JPEG Library - -Linux - -libnvjpeg.so, libnvjpeg_static.a - -Component - -Internal common library required for statically linking to -cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP - -Mac OSX - -libculibos.a - -Linux - -libculibos.a - -Component - -NVIDIA Runtime Compilation Library and Header - -All - -nvrtc.h - -Windows - -nvrtc.dll, nvrtc-builtins.dll - -Mac OSX - -libnvrtc.dylib, libnvrtc-builtins.dylib - -Linux - -libnvrtc.so, libnvrtc-builtins.so - -Component - -NVIDIA Optimizing 
Compiler Library - -Windows - -nvvm.dll - -Mac OSX - -libnvvm.dylib - -Linux - -libnvvm.so - -Component - -NVIDIA Common Device Math Functions Library - -Windows - -libdevice.10.bc - -Mac OSX - -libdevice.10.bc - -Linux - -libdevice.10.bc - -Component - -CUDA Occupancy Calculation Header Library - -All - -cuda_occupancy.h - -Component - -CUDA Half Precision Headers - -All - -cuda_fp16.h, cuda_fp16.hpp - -Component - -CUDA Profiling Tools Interface (CUPTI) Library - -Windows - -cupti.dll - -Mac OSX - -libcupti.dylib - -Linux - -libcupti.so - -Component - -NVIDIA Tools Extension Library - -Windows - -nvToolsExt.dll, nvToolsExt.lib - -Mac OSX - -libnvToolsExt.dylib - -Linux - -libnvToolsExt.so - -Component - -NVIDIA CUDA Driver Libraries - -Linux - -libcuda.so, libnvidia-fatbinaryloader.so, -libnvidia-ptxjitcompiler.so - -The NVIDIA CUDA Driver Libraries are only distributable in -applications that meet this criteria: - - 1. The application was developed starting from a NVIDIA CUDA - container obtained from Docker Hub or the NVIDIA GPU - Cloud, and - - 2. The resulting application is packaged as a Docker - container and distributed to users on Docker Hub or the - NVIDIA GPU Cloud only. - - -2.7. Attachment B - - -Additional Licensing Obligations - -The following third party components included in the SOFTWARE -are licensed to Licensee pursuant to the following terms and -conditions: - - 1. Licensee's use of the GDB third party component is - subject to the terms and conditions of GNU GPL v3: - - This product includes copyrighted third-party software licensed - under the terms of the GNU General Public License v3 ("GPL v3"). - All third-party software packages are copyright by their respective - authors. 
GPL v3 terms and conditions are hereby incorporated into - the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt - - Consistent with these licensing requirements, the software - listed below is provided under the terms of the specified - open source software licenses. To obtain source code for - software provided under licenses that require - redistribution of source code, including the GNU General - Public License (GPL) and GNU Lesser General Public License - (LGPL), contact oss-requests@nvidia.com. This offer is - valid for a period of three (3) years from the date of the - distribution of this product by NVIDIA CORPORATION. - - Component License - CUDA-GDB GPL v3 - - 2. Licensee represents and warrants that any and all third - party licensing and/or royalty payment obligations in - connection with Licensee's use of the H.264 video codecs - are solely the responsibility of Licensee. - - 3. Licensee's use of the Thrust library is subject to the - terms and conditions of the Apache License Version 2.0. - All third-party software packages are copyright by their - respective authors. Apache License Version 2.0 terms and - conditions are hereby incorporated into the Agreement by - this reference. - http://www.apache.org/licenses/LICENSE-2.0.html - - In addition, Licensee acknowledges the following notice: - Thrust includes source code from the Boost Iterator, - Tuple, System, and Random Number libraries. - - Boost Software License - Version 1.0 - August 17th, 2003 - . . . . 
- - Permission is hereby granted, free of charge, to any person or - organization obtaining a copy of the software and accompanying - documentation covered by this license (the "Software") to use, - reproduce, display, distribute, execute, and transmit the Software, - and to prepare derivative works of the Software, and to permit - third-parties to whom the Software is furnished to do so, all - subject to the following: - - The copyright notices in the Software and this entire statement, - including the above license grant, this restriction and the following - disclaimer, must be included in all copies of the Software, in whole - or in part, and all derivative works of the Software, unless such - copies or derivative works are solely in the form of machine-executable - object code generated by a source language processor. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND - NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR - ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR - OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - 4. Licensee's use of the LLVM third party component is - subject to the following terms and conditions: - - ====================================================== - LLVM Release License - ====================================================== - University of Illinois/NCSA - Open Source License - - Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. - All rights reserved. 
- - Developed by: - - LLVM Team - - University of Illinois at Urbana-Champaign - - http://llvm.org - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to - deal with the Software without restriction, including without limitation the - rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - sell copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - - * Neither the names of the LLVM Team, University of Illinois at Urbana- - Champaign, nor the names of its contributors may be used to endorse or - promote products derived from this Software without specific prior - written permission. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS WITH THE SOFTWARE. - - 5. Licensee's use (e.g. nvprof) of the PCRE third party - component is subject to the following terms and - conditions: - - ------------ - PCRE LICENCE - ------------ - PCRE is a library of functions to support regular expressions whose syntax - and semantics are as close as possible to those of the Perl 5 language. 
- Release 8 of PCRE is distributed under the terms of the "BSD" licence, as - specified below. The documentation for PCRE, supplied in the "doc" - directory, is distributed under the same terms as the software itself. The - basic library functions are written in C and are freestanding. Also - included in the distribution is a set of C++ wrapper functions, and a just- - in-time compiler that can be used to optimize pattern matching. These are - both optional features that can be omitted when the library is built. - - THE BASIC LIBRARY FUNCTIONS - --------------------------- - Written by: Philip Hazel - Email local part: ph10 - Email domain: cam.ac.uk - University of Cambridge Computing Service, - Cambridge, England. - Copyright (c) 1997-2012 University of Cambridge - All rights reserved. - - PCRE JUST-IN-TIME COMPILATION SUPPORT - ------------------------------------- - Written by: Zoltan Herczeg - Email local part: hzmester - Emain domain: freemail.hu - Copyright(c) 2010-2012 Zoltan Herczeg - All rights reserved. - - STACK-LESS JUST-IN-TIME COMPILER - -------------------------------- - Written by: Zoltan Herczeg - Email local part: hzmester - Emain domain: freemail.hu - Copyright(c) 2009-2012 Zoltan Herczeg - All rights reserved. - - THE C++ WRAPPER FUNCTIONS - ------------------------- - Contributed by: Google Inc. - Copyright (c) 2007-2012, Google Inc. - All rights reserved. - - THE "BSD" LICENCE - ----------------- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the University of Cambridge nor the name of Google - Inc. nor the names of their contributors may be used to endorse or - promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 6. Some of the cuBLAS library routines were written by or - derived from code written by Vasily Volkov and are subject - to the Modified Berkeley Software Distribution License as - follows: - - Copyright (c) 2007-2009, Regents of the University of California - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- * Neither the name of the University of California, Berkeley nor - the names of its contributors may be used to endorse or promote - products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR - IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 7. Some of the cuBLAS library routines were written by or - derived from code written by Davide Barbieri and are - subject to the Modified Berkeley Software Distribution - License as follows: - - Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * The name of the author may not be used to endorse or promote - products derived from this software without specific prior - written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR - IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 8. Some of the cuBLAS library routines were derived from - code developed by the University of Tennessee and are - subject to the Modified Berkeley Software Distribution - License as follows: - - Copyright (c) 2010 The University of Tennessee. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer listed in this license in the documentation and/or - other materials provided with the distribution. - * Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 9. Some of the cuBLAS library routines were written by or - derived from code written by Jonathan Hogg and are subject - to the Modified Berkeley Software Distribution License as - follows: - - Copyright (c) 2012, The Science and Technology Facilities Council (STFC). - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of the STFC nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE STFC BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE - OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN - IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 10. Some of the cuBLAS library routines were written by or - derived from code written by Ahmad M. Abdelfattah, David - Keyes, and Hatem Ltaief, and are subject to the Apache - License, Version 2.0, as follows: - - -- (C) Copyright 2013 King Abdullah University of Science and Technology - Authors: - Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa) - David Keyes (david.keyes@kaust.edu.sa) - Hatem Ltaief (hatem.ltaief@kaust.edu.sa) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the King Abdullah University of Science and - Technology nor the names of its contributors may be used to endorse - or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE - - 11. Some of the cuSPARSE library routines were written by or - derived from code written by Li-Wen Chang and are subject - to the NCSA Open Source License as follows: - - Copyright (c) 2012, University of Illinois. - - All rights reserved. - - Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal with the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimers in the documentation and/or other materials provided - with the distribution. - * Neither the names of IMPACT Group, University of Illinois, nor - the names of its contributors may be used to endorse or promote - products derived from this Software without specific prior - written permission. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR - IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - 12. Some of the cuRAND library routines were written by or - derived from code written by Mutsuo Saito and Makoto - Matsumoto and are subject to the following license: - - Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima - University. All rights reserved. - - Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima - University and University of Tokyo. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of the Hiroshima University nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 13. Some of the cuRAND library routines were derived from - code developed by D. E. Shaw Research and are subject to - the following license: - - Copyright 2010-2011, D. E. Shaw Research. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions, and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions, and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of D. E. Shaw Research nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 14. Some of the Math library routines were written by or - derived from code developed by Norbert Juffa and are - subject to the following license: - - Copyright (c) 2015-2017, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 15. Licensee's use of the lz4 third party component is - subject to the following terms and conditions: - - Copyright (C) 2011-2013, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 16. The NPP library uses code from the Boost Math Toolkit, - and is subject to the following license: - - Boost Software License - Version 1.0 - August 17th, 2003 - . . . . - - Permission is hereby granted, free of charge, to any person or - organization obtaining a copy of the software and accompanying - documentation covered by this license (the "Software") to use, - reproduce, display, distribute, execute, and transmit the Software, - and to prepare derivative works of the Software, and to permit - third-parties to whom the Software is furnished to do so, all - subject to the following: - - The copyright notices in the Software and this entire statement, - including the above license grant, this restriction and the following - disclaimer, must be included in all copies of the Software, in whole - or in part, and all derivative works of the Software, unless such - copies or derivative works are solely in the form of machine-executable - object code generated by a source language processor. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND - NON-INFRINGEMENT. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR - ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR - OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - 17. Portions of the Nsight Eclipse Edition is subject to the - following license: - - The Eclipse Foundation makes available all content in this plug-in - ("Content"). Unless otherwise indicated below, the Content is provided - to you under the terms and conditions of the Eclipse Public License - Version 1.0 ("EPL"). A copy of the EPL is available at http:// - www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program" - will mean the Content. - - If you did not receive this Content directly from the Eclipse - Foundation, the Content is being redistributed by another party - ("Redistributor") and different terms and conditions may apply to your - use of any object code in the Content. Check the Redistributor's - license that was provided with the Content. If no such license exists, - contact the Redistributor. Unless otherwise indicated below, the terms - and conditions of the EPL still apply to any source code in the - Content and such source code may be obtained at http://www.eclipse.org. - - 18. Some of the cuBLAS library routines uses code from - OpenAI, which is subject to the following license: - - License URL - https://github.com/openai/openai-gemm/blob/master/LICENSE - - License Text - The MIT License - - Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - - 19. Licensee's use of the Visual Studio Setup Configuration - Samples is subject to the following license: - - The MIT License (MIT) - Copyright (C) Microsoft Corporation. All rights reserved. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - 20. Licensee's use of linmath.h header for CPU functions for - GL vector/matrix operations from lunarG is subject to the - Apache License Version 2.0. - - 21. The DX12-CUDA sample uses the d3dx12.h header, which is - subject to the MIT license. - ------------------ -``` - -### URLs - - `Homepage`: https://developer.nvidia.com/cuda-zone - - -## nvidia-cuda-cupti-cu12 (12.8.90) - -### Licenses -License: `NVIDIA Proprietary Software` - - - `License.txt`: -``` -End User License Agreement --------------------------- - - -Preface -------- - -The Software License Agreement in Chapter 1 and the Supplement -in Chapter 2 contain license terms and conditions that govern -the use of NVIDIA software. By accepting this agreement, you -agree to comply with all the terms and conditions applicable -to the product(s) included herein. - - -NVIDIA Driver - - -Description - -This package contains the operating system driver and -fundamental system software components for NVIDIA GPUs. - - -NVIDIA CUDA Toolkit - - -Description - -The NVIDIA CUDA Toolkit provides command-line and graphical -tools for building, debugging and optimizing the performance -of applications accelerated by NVIDIA GPUs, runtime and math -libraries, and documentation including programming guides, -user manuals, and API references. 
- - -Default Install Location of CUDA Toolkit - -Windows platform: - -%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.# - -Linux platform: - -/usr/local/cuda-#.# - -Mac platform: - -/Developer/NVIDIA/CUDA-#.# - - -NVIDIA CUDA Samples - - -Description - -This package includes over 100+ CUDA examples that demonstrate -various CUDA programming principles, and efficient CUDA -implementation of algorithms in specific application domains. - - -Default Install Location of CUDA Samples - -Windows platform: - -%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.# - -Linux platform: - -/usr/local/cuda-#.#/samples - -and - -$HOME/NVIDIA_CUDA-#.#_Samples - -Mac platform: - -/Developer/NVIDIA/CUDA-#.#/samples - - -NVIDIA Nsight Visual Studio Edition (Windows only) - - -Description - -NVIDIA Nsight Development Platform, Visual Studio Edition is a -development environment integrated into Microsoft Visual -Studio that provides tools for debugging, profiling, analyzing -and optimizing your GPU computing and graphics applications. - - -Default Install Location of Nsight Visual Studio Edition - -Windows platform: - -%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.# - - -1. License Agreement for NVIDIA Software Development Kits ---------------------------------------------------------- - - -Release Date: July 26, 2018 ---------------------------- - - -Important Notice—Read before downloading, installing, -copying or using the licensed software: -------------------------------------------------------- - -This license agreement, including exhibits attached -(“Agreement”) is a legal agreement between you and NVIDIA -Corporation ("NVIDIA") and governs your use of a NVIDIA -software development kit (“SDK”). 
- -Each SDK has its own set of software and materials, but here -is a description of the types of items that may be included in -a SDK: source code, header files, APIs, data sets and assets -(examples include images, textures, models, scenes, videos, -native API input/output files), binary software, sample code, -libraries, utility programs, programming code and -documentation. - -This Agreement can be accepted only by an adult of legal age -of majority in the country in which the SDK is used. - -If you are entering into this Agreement on behalf of a company -or other legal entity, you represent that you have the legal -authority to bind the entity to this Agreement, in which case -“you” will mean the entity you represent. - -If you don’t have the required age or authority to accept -this Agreement, or if you don’t accept all the terms and -conditions of this Agreement, do not download, install or use -the SDK. - -You agree to use the SDK only for purposes that are permitted -by (a) this Agreement, and (b) any applicable law, regulation -or generally accepted practices or guidelines in the relevant -jurisdictions. - - -1.1. License - - -1.1.1. License Grant - -Subject to the terms of this Agreement, NVIDIA hereby grants -you a non-exclusive, non-transferable license, without the -right to sublicense (except as expressly provided in this -Agreement) to: - - 1. Install and use the SDK, - - 2. Modify and create derivative works of sample source code - delivered in the SDK, and - - 3. Distribute those portions of the SDK that are identified - in this Agreement as distributable, as incorporated in - object code format into a software application that meets - the distribution requirements indicated in this Agreement. - - -1.1.2. Distribution Requirements - -These are the distribution requirements for you to exercise -the distribution grant: - - 1. Your application must have material additional - functionality, beyond the included portions of the SDK. - - 2. 
The distributable portions of the SDK shall only be - accessed by your application. - - 3. The following notice shall be included in modifications - and derivative works of sample source code distributed: - “This software contains source code provided by NVIDIA - Corporation.” - - 4. Unless a developer tool is identified in this Agreement - as distributable, it is delivered for your internal use - only. - - 5. The terms under which you distribute your application - must be consistent with the terms of this Agreement, - including (without limitation) terms relating to the - license grant and license restrictions and protection of - NVIDIA’s intellectual property rights. Additionally, you - agree that you will protect the privacy, security and - legal rights of your application users. - - 6. You agree to notify NVIDIA in writing of any known or - suspected distribution or use of the SDK not in compliance - with the requirements of this Agreement, and to enforce - the terms of your agreements with respect to distributed - SDK. - - -1.1.3. Authorized Users - -You may allow employees and contractors of your entity or of -your subsidiary(ies) to access and use the SDK from your -secure network to perform work on your behalf. - -If you are an academic institution you may allow users -enrolled or employed by the academic institution to access and -use the SDK from your secure network. - -You are responsible for the compliance with the terms of this -Agreement by your authorized users. If you become aware that -your authorized users didn’t follow the terms of this -Agreement, you agree to take reasonable steps to resolve the -non-compliance and prevent new occurrences. - - -1.1.4. 
Pre-Release SDK - -The SDK versions identified as alpha, beta, preview or -otherwise as pre-release, may not be fully functional, may -contain errors or design flaws, and may have reduced or -different security, privacy, accessibility, availability, and -reliability standards relative to commercial versions of -NVIDIA software and materials. Use of a pre-release SDK may -result in unexpected results, loss of data, project delays or -other unpredictable damage or loss. - -You may use a pre-release SDK at your own risk, understanding -that pre-release SDKs are not intended for use in production -or business-critical systems. - -NVIDIA may choose not to make available a commercial version -of any pre-release SDK. NVIDIA may also choose to abandon -development and terminate the availability of a pre-release -SDK at any time without liability. - - -1.1.5. Updates - -NVIDIA may, at its option, make available patches, workarounds -or other updates to this SDK. Unless the updates are provided -with their separate governing terms, they are deemed part of -the SDK licensed to you as provided in this Agreement. You -agree that the form and content of the SDK that NVIDIA -provides may change without prior notice to you. While NVIDIA -generally maintains compatibility between versions, NVIDIA may -in some cases make changes that introduce incompatibilities in -future versions of the SDK. - - -1.1.6. Third Party Licenses - -The SDK may come bundled with, or otherwise include or be -distributed with, third party software licensed by a NVIDIA -supplier and/or open source software provided under an open -source license. Use of third party software is subject to the -third-party license terms, or in the absence of third party -terms, the terms of this Agreement. Copyright to third party -software is held by the copyright holders indicated in the -third-party software or license. - - -1.1.7. 
Reservation of Rights - -NVIDIA reserves all rights, title, and interest in and to the -SDK, not expressly granted to you under this Agreement. - - -1.2. Limitations - -The following license limitations apply to your use of the -SDK: - - 1. You may not reverse engineer, decompile or disassemble, - or remove copyright or other proprietary notices from any - portion of the SDK or copies of the SDK. - - 2. Except as expressly provided in this Agreement, you may - not copy, sell, rent, sublicense, transfer, distribute, - modify, or create derivative works of any portion of the - SDK. For clarity, you may not distribute or sublicense the - SDK as a stand-alone product. - - 3. Unless you have an agreement with NVIDIA for this - purpose, you may not indicate that an application created - with the SDK is sponsored or endorsed by NVIDIA. - - 4. You may not bypass, disable, or circumvent any - encryption, security, digital rights management or - authentication mechanism in the SDK. - - 5. You may not use the SDK in any manner that would cause it - to become subject to an open source software license. As - examples, licenses that require as a condition of use, - modification, and/or distribution that the SDK be: - - a. Disclosed or distributed in source code form; - - b. Licensed for the purpose of making derivative works; - or - - c. Redistributable at no charge. - - 6. Unless you have an agreement with NVIDIA for this - purpose, you may not use the SDK with any system or - application where the use or failure of the system or - application can reasonably be expected to threaten or - result in personal injury, death, or catastrophic loss. - Examples include use in avionics, navigation, military, - medical, life support or other life critical applications. - NVIDIA does not design, test or manufacture the SDK for - these critical uses and NVIDIA shall not be liable to you - or any third party, in whole or in part, for any claims or - damages arising from such uses. - - 7. 
You agree to defend, indemnify and hold harmless NVIDIA - and its affiliates, and their respective employees, - contractors, agents, officers and directors, from and - against any and all claims, damages, obligations, losses, - liabilities, costs or debt, fines, restitutions and - expenses (including but not limited to attorney’s fees - and costs incident to establishing the right of - indemnification) arising out of or related to your use of - the SDK outside of the scope of this Agreement, or not in - compliance with its terms. - - -1.3. Ownership - - 1. NVIDIA or its licensors hold all rights, title and - interest in and to the SDK and its modifications and - derivative works, including their respective intellectual - property rights, subject to your rights described in this - section. This SDK may include software and materials from - NVIDIA’s licensors, and these licensors are intended - third party beneficiaries that may enforce this Agreement - with respect to their intellectual property rights. - - 2. You hold all rights, title and interest in and to your - applications and your derivative works of the sample - source code delivered in the SDK, including their - respective intellectual property rights, subject to - NVIDIA’s rights described in this section. - - 3. You may, but don’t have to, provide to NVIDIA - suggestions, feature requests or other feedback regarding - the SDK, including possible enhancements or modifications - to the SDK. For any feedback that you voluntarily provide, - you hereby grant NVIDIA and its affiliates a perpetual, - non-exclusive, worldwide, irrevocable license to use, - reproduce, modify, license, sublicense (through multiple - tiers of sublicensees), and distribute (through multiple - tiers of distributors) it without the payment of any - royalties or fees to you. NVIDIA will use feedback at its - choice. 
NVIDIA is constantly looking for ways to improve - its products, so you may send feedback to NVIDIA through - the developer portal at https://developer.nvidia.com. - - -1.4. No Warranties - -THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL -FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND -ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND -OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, -BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE -ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO -WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF -DEALING OR COURSE OF TRADE. - - -1.5. Limitation of Liability - -TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS -AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, -PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS -OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF -PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION -WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, -WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH -OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), -PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF -LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES -TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS -AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE -NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS -LIMIT. - -These exclusions and limitations of liability shall apply -regardless if NVIDIA or its affiliates have been advised of -the possibility of such damages, and regardless of whether a -remedy fails its essential purpose. 
These exclusions and -limitations of liability form an essential basis of the -bargain between the parties, and, absent any of these -exclusions or limitations of liability, the provisions of this -Agreement, including, without limitation, the economic terms, -would be substantially different. - - -1.6. Termination - - 1. This Agreement will continue to apply until terminated by - either you or NVIDIA as described below. - - 2. If you want to terminate this Agreement, you may do so by - stopping to use the SDK. - - 3. NVIDIA may, at any time, terminate this Agreement if: - - a. (i) you fail to comply with any term of this - Agreement and the non-compliance is not fixed within - thirty (30) days following notice from NVIDIA (or - immediately if you violate NVIDIA’s intellectual - property rights); - - b. (ii) you commence or participate in any legal - proceeding against NVIDIA with respect to the SDK; or - - c. (iii) NVIDIA decides to no longer provide the SDK in - a country or, in NVIDIA’s sole discretion, the - continued use of it is no longer commercially viable. - - 4. Upon any termination of this Agreement, you agree to - promptly discontinue use of the SDK and destroy all copies - in your possession or control. Your prior distributions in - accordance with this Agreement are not affected by the - termination of this Agreement. Upon written request, you - will certify in writing that you have complied with your - commitments under this section. Upon any termination of - this Agreement all provisions survive except for the - license grant provisions. - - -1.7. General - -If you wish to assign this Agreement or your rights and -obligations, including by merger, consolidation, dissolution -or operation of law, contact NVIDIA to ask for permission. Any -attempted assignment not approved by NVIDIA in writing shall -be void and of no effect. 
NVIDIA may assign, delegate or -transfer this Agreement and its rights and obligations, and if -to a non-affiliate you will be notified. - -You agree to cooperate with NVIDIA and provide reasonably -requested information to verify your compliance with this -Agreement. - -This Agreement will be governed in all respects by the laws of -the United States and of the State of Delaware as those laws -are applied to contracts entered into and performed entirely -within Delaware by Delaware residents, without regard to the -conflicts of laws principles. The United Nations Convention on -Contracts for the International Sale of Goods is specifically -disclaimed. You agree to all terms of this Agreement in the -English language. - -The state or federal courts residing in Santa Clara County, -California shall have exclusive jurisdiction over any dispute -or claim arising out of this Agreement. Notwithstanding this, -you agree that NVIDIA shall still be allowed to apply for -injunctive remedies or an equivalent type of urgent legal -relief in any jurisdiction. - -If any court of competent jurisdiction determines that any -provision of this Agreement is illegal, invalid or -unenforceable, such provision will be construed as limited to -the extent necessary to be consistent with and fully -enforceable under the law and the remaining provisions will -remain in full force and effect. Unless otherwise specified, -remedies are cumulative. - -Each party acknowledges and agrees that the other is an -independent contractor in the performance of this Agreement. - -The SDK has been developed entirely at private expense and is -“commercial items” consisting of “commercial computer -software” and “commercial computer software -documentation” provided with RESTRICTED RIGHTS. Use, -duplication or disclosure by the U.S. Government or a U.S. 
-Government subcontractor is subject to the restrictions in -this Agreement pursuant to DFARS 227.7202-3(a) or as set forth -in subparagraphs (c)(1) and (2) of the Commercial Computer -Software - Restricted Rights clause at FAR 52.227-19, as -applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas -Expressway, Santa Clara, CA 95051. - -The SDK is subject to United States export laws and -regulations. You agree that you will not ship, transfer or -export the SDK into any country, or use the SDK in any manner, -prohibited by the United States Bureau of Industry and -Security or economic sanctions regulations administered by the -U.S. Department of Treasury’s Office of Foreign Assets -Control (OFAC), or any applicable export laws, restrictions or -regulations. These laws include restrictions on destinations, -end users and end use. By accepting this Agreement, you -confirm that you are not a resident or citizen of any country -currently embargoed by the U.S. and that you are not otherwise -prohibited from receiving the SDK. - -Any notice delivered by NVIDIA to you under this Agreement -will be delivered via mail, email or fax. You agree that any -notices that NVIDIA sends you electronically will satisfy any -legal communication requirements. Please direct your legal -notices or other correspondence to NVIDIA Corporation, 2788 -San Tomas Expressway, Santa Clara, California 95051, United -States of America, Attention: Legal Department. - -This Agreement and any exhibits incorporated into this -Agreement constitute the entire agreement of the parties with -respect to the subject matter of this Agreement and supersede -all prior negotiations or documentation exchanged between the -parties relating to this SDK license. Any additional and/or -conflicting terms on documents issued by you are null, void, -and invalid. Any amendment or waiver under this Agreement -shall be in writing and signed by representatives of both -parties. - - -2. 
CUDA Toolkit Supplement to Software License Agreement for -NVIDIA Software Development Kits ------------------------------------------------------------- - - -Release date: August 16, 2018 ------------------------------ - -The terms in this supplement govern your use of the NVIDIA -CUDA Toolkit SDK under the terms of your license agreement -(“Agreement”) as modified by this supplement. Capitalized -terms used but not defined below have the meaning assigned to -them in the Agreement. - -This supplement is an exhibit to the Agreement and is -incorporated as an integral part of the Agreement. In the -event of conflict between the terms in this supplement and the -terms in the Agreement, the terms in this supplement govern. - - -2.1. License Scope - -The SDK is licensed for you to develop applications only for -use in systems with NVIDIA GPUs. - - -2.2. Distribution - -The portions of the SDK that are distributable under the -Agreement are listed in Attachment A. - - -2.3. Operating Systems - -Those portions of the SDK designed exclusively for use on the -Linux or FreeBSD operating systems, or other operating systems -derived from the source code to these operating systems, may -be copied and redistributed for use in accordance with this -Agreement, provided that the object code files are not -modified in any way (except for unzipping of compressed -files). - - -2.4. Audio and Video Encoders and Decoders - -You acknowledge and agree that it is your sole responsibility -to obtain any additional third-party licenses required to -make, have made, use, have used, sell, import, and offer for -sale your products or services that include or incorporate any -third-party software and content relating to audio and/or -video encoders and decoders from, including but not limited -to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., -MPEG-LA, and Coding Technologies. 
NVIDIA does not grant to you -under this Agreement any necessary patent or other rights with -respect to any audio and/or video encoders and decoders. - - -2.5. Licensing - -If the distribution terms in this Agreement are not suitable -for your organization, or for any questions regarding this -Agreement, please contact NVIDIA at -nvidia-compute-license-questions@nvidia.com. - - -2.6. Attachment A - -The following portions of the SDK are distributable under the -Agreement: - -Component - -CUDA Runtime - -Windows - -cudart.dll, cudart_static.lib, cudadevrt.lib - -Mac OSX - -libcudart.dylib, libcudart_static.a, libcudadevrt.a - -Linux - -libcudart.so, libcudart_static.a, libcudadevrt.a - -Android - -libcudart.so, libcudart_static.a, libcudadevrt.a - -Component - -CUDA FFT Library - -Windows - -cufft.dll, cufftw.dll, cufft.lib, cufftw.lib - -Mac OSX - -libcufft.dylib, libcufft_static.a, libcufftw.dylib, -libcufftw_static.a - -Linux - -libcufft.so, libcufft_static.a, libcufftw.so, -libcufftw_static.a - -Android - -libcufft.so, libcufft_static.a, libcufftw.so, -libcufftw_static.a - -Component - -CUDA BLAS Library - -Windows - -cublas.dll, cublasLt.dll - -Mac OSX - -libcublas.dylib, libcublasLt.dylib, libcublas_static.a, -libcublasLt_static.a - -Linux - -libcublas.so, libcublasLt.so, libcublas_static.a, -libcublasLt_static.a - -Android - -libcublas.so, libcublasLt.so, libcublas_static.a, -libcublasLt_static.a - -Component - -NVIDIA "Drop-in" BLAS Library - -Windows - -nvblas.dll - -Mac OSX - -libnvblas.dylib - -Linux - -libnvblas.so - -Component - -CUDA Sparse Matrix Library - -Windows - -cusparse.dll, cusparse.lib - -Mac OSX - -libcusparse.dylib, libcusparse_static.a - -Linux - -libcusparse.so, libcusparse_static.a - -Android - -libcusparse.so, libcusparse_static.a - -Component - -CUDA Linear Solver Library - -Windows - -cusolver.dll, cusolver.lib - -Mac OSX - -libcusolver.dylib, libcusolver_static.a - -Linux - -libcusolver.so, libcusolver_static.a - -Android - 
-libcusolver.so, libcusolver_static.a - -Component - -CUDA Random Number Generation Library - -Windows - -curand.dll, curand.lib - -Mac OSX - -libcurand.dylib, libcurand_static.a - -Linux - -libcurand.so, libcurand_static.a - -Android - -libcurand.so, libcurand_static.a - -Component - -CUDA Accelerated Graph Library - -Component - -NVIDIA Performance Primitives Library - -Windows - -nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll, -nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll, -nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib, -nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll, -nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib - -Mac OSX - -libnppc.dylib, libnppc_static.a, libnppial.dylib, -libnppial_static.a, libnppicc.dylib, libnppicc_static.a, -libnppicom.dylib, libnppicom_static.a, libnppidei.dylib, -libnppidei_static.a, libnppif.dylib, libnppif_static.a, -libnppig.dylib, libnppig_static.a, libnppim.dylib, -libnppisu_static.a, libnppitc.dylib, libnppitc_static.a, -libnpps.dylib, libnpps_static.a - -Linux - -libnppc.so, libnppc_static.a, libnppial.so, -libnppial_static.a, libnppicc.so, libnppicc_static.a, -libnppicom.so, libnppicom_static.a, libnppidei.so, -libnppidei_static.a, libnppif.so, libnppif_static.a -libnppig.so, libnppig_static.a, libnppim.so, -libnppim_static.a, libnppist.so, libnppist_static.a, -libnppisu.so, libnppisu_static.a, libnppitc.so -libnppitc_static.a, libnpps.so, libnpps_static.a - -Android - -libnppc.so, libnppc_static.a, libnppial.so, -libnppial_static.a, libnppicc.so, libnppicc_static.a, -libnppicom.so, libnppicom_static.a, libnppidei.so, -libnppidei_static.a, libnppif.so, libnppif_static.a -libnppig.so, libnppig_static.a, libnppim.so, -libnppim_static.a, libnppist.so, libnppist_static.a, -libnppisu.so, libnppisu_static.a, libnppitc.so -libnppitc_static.a, libnpps.so, libnpps_static.a - -Component - -NVIDIA JPEG Library - -Linux - -libnvjpeg.so, libnvjpeg_static.a - -Component - -Internal common library 
required for statically linking to -cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP - -Mac OSX - -libculibos.a - -Linux - -libculibos.a - -Component - -NVIDIA Runtime Compilation Library and Header - -All - -nvrtc.h - -Windows - -nvrtc.dll, nvrtc-builtins.dll - -Mac OSX - -libnvrtc.dylib, libnvrtc-builtins.dylib - -Linux - -libnvrtc.so, libnvrtc-builtins.so - -Component - -NVIDIA Optimizing Compiler Library - -Windows - -nvvm.dll - -Mac OSX - -libnvvm.dylib - -Linux - -libnvvm.so - -Component - -NVIDIA Common Device Math Functions Library - -Windows - -libdevice.10.bc - -Mac OSX - -libdevice.10.bc - -Linux - -libdevice.10.bc - -Component - -CUDA Occupancy Calculation Header Library - -All - -cuda_occupancy.h - -Component - -CUDA Half Precision Headers - -All - -cuda_fp16.h, cuda_fp16.hpp - -Component - -CUDA Profiling Tools Interface (CUPTI) Library - -Windows - -cupti.dll - -Mac OSX - -libcupti.dylib - -Linux - -libcupti.so - -Component - -NVIDIA Tools Extension Library - -Windows - -nvToolsExt.dll, nvToolsExt.lib - -Mac OSX - -libnvToolsExt.dylib - -Linux - -libnvToolsExt.so - -Component - -NVIDIA CUDA Driver Libraries - -Linux - -libcuda.so, libnvidia-fatbinaryloader.so, -libnvidia-ptxjitcompiler.so - -The NVIDIA CUDA Driver Libraries are only distributable in -applications that meet this criteria: - - 1. The application was developed starting from a NVIDIA CUDA - container obtained from Docker Hub or the NVIDIA GPU - Cloud, and - - 2. The resulting application is packaged as a Docker - container and distributed to users on Docker Hub or the - NVIDIA GPU Cloud only. - - -2.7. Attachment B - - -Additional Licensing Obligations - -The following third party components included in the SOFTWARE -are licensed to Licensee pursuant to the following terms and -conditions: - - 1. 
Licensee's use of the GDB third party component is - subject to the terms and conditions of GNU GPL v3: - - This product includes copyrighted third-party software licensed - under the terms of the GNU General Public License v3 ("GPL v3"). - All third-party software packages are copyright by their respective - authors. GPL v3 terms and conditions are hereby incorporated into - the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt - - Consistent with these licensing requirements, the software - listed below is provided under the terms of the specified - open source software licenses. To obtain source code for - software provided under licenses that require - redistribution of source code, including the GNU General - Public License (GPL) and GNU Lesser General Public License - (LGPL), contact oss-requests@nvidia.com. This offer is - valid for a period of three (3) years from the date of the - distribution of this product by NVIDIA CORPORATION. - - Component License - CUDA-GDB GPL v3 - - 2. Licensee represents and warrants that any and all third - party licensing and/or royalty payment obligations in - connection with Licensee's use of the H.264 video codecs - are solely the responsibility of Licensee. - - 3. Licensee's use of the Thrust library is subject to the - terms and conditions of the Apache License Version 2.0. - All third-party software packages are copyright by their - respective authors. Apache License Version 2.0 terms and - conditions are hereby incorporated into the Agreement by - this reference. - http://www.apache.org/licenses/LICENSE-2.0.html - - In addition, Licensee acknowledges the following notice: - Thrust includes source code from the Boost Iterator, - Tuple, System, and Random Number libraries. - - Boost Software License - Version 1.0 - August 17th, 2003 - . . . . 
- - Permission is hereby granted, free of charge, to any person or - organization obtaining a copy of the software and accompanying - documentation covered by this license (the "Software") to use, - reproduce, display, distribute, execute, and transmit the Software, - and to prepare derivative works of the Software, and to permit - third-parties to whom the Software is furnished to do so, all - subject to the following: - - The copyright notices in the Software and this entire statement, - including the above license grant, this restriction and the following - disclaimer, must be included in all copies of the Software, in whole - or in part, and all derivative works of the Software, unless such - copies or derivative works are solely in the form of machine-executable - object code generated by a source language processor. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND - NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR - ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR - OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - 4. Licensee's use of the LLVM third party component is - subject to the following terms and conditions: - - ====================================================== - LLVM Release License - ====================================================== - University of Illinois/NCSA - Open Source License - - Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. - All rights reserved. 
- - Developed by: - - LLVM Team - - University of Illinois at Urbana-Champaign - - http://llvm.org - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to - deal with the Software without restriction, including without limitation the - rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - sell copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - - * Neither the names of the LLVM Team, University of Illinois at Urbana- - Champaign, nor the names of its contributors may be used to endorse or - promote products derived from this Software without specific prior - written permission. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS WITH THE SOFTWARE. - - 5. Licensee's use (e.g. nvprof) of the PCRE third party - component is subject to the following terms and - conditions: - - ------------ - PCRE LICENCE - ------------ - PCRE is a library of functions to support regular expressions whose syntax - and semantics are as close as possible to those of the Perl 5 language. 
- Release 8 of PCRE is distributed under the terms of the "BSD" licence, as - specified below. The documentation for PCRE, supplied in the "doc" - directory, is distributed under the same terms as the software itself. The - basic library functions are written in C and are freestanding. Also - included in the distribution is a set of C++ wrapper functions, and a just- - in-time compiler that can be used to optimize pattern matching. These are - both optional features that can be omitted when the library is built. - - THE BASIC LIBRARY FUNCTIONS - --------------------------- - Written by: Philip Hazel - Email local part: ph10 - Email domain: cam.ac.uk - University of Cambridge Computing Service, - Cambridge, England. - Copyright (c) 1997-2012 University of Cambridge - All rights reserved. - - PCRE JUST-IN-TIME COMPILATION SUPPORT - ------------------------------------- - Written by: Zoltan Herczeg - Email local part: hzmester - Emain domain: freemail.hu - Copyright(c) 2010-2012 Zoltan Herczeg - All rights reserved. - - STACK-LESS JUST-IN-TIME COMPILER - -------------------------------- - Written by: Zoltan Herczeg - Email local part: hzmester - Emain domain: freemail.hu - Copyright(c) 2009-2012 Zoltan Herczeg - All rights reserved. - - THE C++ WRAPPER FUNCTIONS - ------------------------- - Contributed by: Google Inc. - Copyright (c) 2007-2012, Google Inc. - All rights reserved. - - THE "BSD" LICENCE - ----------------- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - * Neither the name of the University of Cambridge nor the name of Google - Inc. nor the names of their contributors may be used to endorse or - promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 6. Some of the cuBLAS library routines were written by or - derived from code written by Vasily Volkov and are subject - to the Modified Berkeley Software Distribution License as - follows: - - Copyright (c) 2007-2009, Regents of the University of California - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- * Neither the name of the University of California, Berkeley nor - the names of its contributors may be used to endorse or promote - products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR - IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 7. Some of the cuBLAS library routines were written by or - derived from code written by Davide Barbieri and are - subject to the Modified Berkeley Software Distribution - License as follows: - - Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * The name of the author may not be used to endorse or promote - products derived from this software without specific prior - written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR - IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. - - 8. Some of the cuBLAS library routines were derived from - code developed by the University of Tennessee and are - subject to the Modified Berkeley Software Distribution - License as follows: - - Copyright (c) 2010 The University of Tennessee. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer listed in this license in the documentation and/or - other materials provided with the distribution. - * Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 9. Some of the cuBLAS library routines were written by or - derived from code written by Jonathan Hogg and are subject - to the Modified Berkeley Software Distribution License as - follows: - - Copyright (c) 2012, The Science and Technology Facilities Council (STFC). - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of the STFC nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE STFC BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE - OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN - IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 10. Some of the cuBLAS library routines were written by or - derived from code written by Ahmad M. Abdelfattah, David - Keyes, and Hatem Ltaief, and are subject to the Apache - License, Version 2.0, as follows: - - -- (C) Copyright 2013 King Abdullah University of Science and Technology - Authors: - Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa) - David Keyes (david.keyes@kaust.edu.sa) - Hatem Ltaief (hatem.ltaief@kaust.edu.sa) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the King Abdullah University of Science and - Technology nor the names of its contributors may be used to endorse - or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE - - 11. Some of the cuSPARSE library routines were written by or - derived from code written by Li-Wen Chang and are subject - to the NCSA Open Source License as follows: - - Copyright (c) 2012, University of Illinois. - - All rights reserved. - - Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal with the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimers in the documentation and/or other materials provided - with the distribution. - * Neither the names of IMPACT Group, University of Illinois, nor - the names of its contributors may be used to endorse or promote - products derived from this Software without specific prior - written permission. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR - IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - 12. Some of the cuRAND library routines were written by or - derived from code written by Mutsuo Saito and Makoto - Matsumoto and are subject to the following license: - - Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima - University. All rights reserved. - - Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima - University and University of Tokyo. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of the Hiroshima University nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 13. Some of the cuRAND library routines were derived from - code developed by D. E. Shaw Research and are subject to - the following license: - - Copyright 2010-2011, D. E. Shaw Research. - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions, and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions, and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of D. E. Shaw Research nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 14. Some of the Math library routines were written by or - derived from code developed by Norbert Juffa and are - subject to the following license: - - Copyright (c) 2015-2017, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 15. Licensee's use of the lz4 third party component is - subject to the following terms and conditions: - - Copyright (C) 2011-2013, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - 16. The NPP library uses code from the Boost Math Toolkit, - and is subject to the following license: - - Boost Software License - Version 1.0 - August 17th, 2003 - . . . . - - Permission is hereby granted, free of charge, to any person or - organization obtaining a copy of the software and accompanying - documentation covered by this license (the "Software") to use, - reproduce, display, distribute, execute, and transmit the Software, - and to prepare derivative works of the Software, and to permit - third-parties to whom the Software is furnished to do so, all - subject to the following: - - The copyright notices in the Software and this entire statement, - including the above license grant, this restriction and the following - disclaimer, must be included in all copies of the Software, in whole - or in part, and all derivative works of the Software, unless such - copies or derivative works are solely in the form of machine-executable - object code generated by a source language processor. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND - NON-INFRINGEMENT. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR - ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR - OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - 17. Portions of the Nsight Eclipse Edition is subject to the - following license: - - The Eclipse Foundation makes available all content in this plug-in - ("Content"). Unless otherwise indicated below, the Content is provided - to you under the terms and conditions of the Eclipse Public License - Version 1.0 ("EPL"). A copy of the EPL is available at http:// - www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program" - will mean the Content. - - If you did not receive this Content directly from the Eclipse - Foundation, the Content is being redistributed by another party - ("Redistributor") and different terms and conditions may apply to your - use of any object code in the Content. Check the Redistributor's - license that was provided with the Content. If no such license exists, - contact the Redistributor. Unless otherwise indicated below, the terms - and conditions of the EPL still apply to any source code in the - Content and such source code may be obtained at http://www.eclipse.org. - - 18. Some of the cuBLAS library routines uses code from - OpenAI, which is subject to the following license: - - License URL - https://github.com/openai/openai-gemm/blob/master/LICENSE - - License Text - The MIT License - - Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - - 19. Licensee's use of the Visual Studio Setup Configuration - Samples is subject to the following license: - - The MIT License (MIT) - Copyright (C) Microsoft Corporation. All rights reserved. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - 20. Licensee's use of linmath.h header for CPU functions for - GL vector/matrix operations from lunarG is subject to the - Apache License Version 2.0. - - 21. The DX12-CUDA sample uses the d3dx12.h header, which is - subject to the MIT license . - ------------------ -``` - -### URLs - - `Homepage`: https://developer.nvidia.com/cuda-zone - - -## nvidia-cuda-nvrtc (13.0.88) +## nvidia-cublas (13.0.0.19) ### Licenses License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -13965,12 +20373,12 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cuda-nvrtc-cu12 (12.8.93) +## nvidia-cuda-cupti (13.0.48) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -15546,12 +21954,1593 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cuda-runtime-cu12 (12.8.90) +## nvidia-cuda-nvrtc (13.0.48) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: +``` +End User License Agreement +-------------------------- + + +Preface +------- + +The Software License Agreement in Chapter 1 and the Supplement +in Chapter 2 contain license terms and conditions that govern +the use of NVIDIA software. 
By accepting this agreement, you +agree to comply with all the terms and conditions applicable +to the product(s) included herein. + + +NVIDIA Driver + + +Description + +This package contains the operating system driver and +fundamental system software components for NVIDIA GPUs. + + +NVIDIA CUDA Toolkit + + +Description + +The NVIDIA CUDA Toolkit provides command-line and graphical +tools for building, debugging and optimizing the performance +of applications accelerated by NVIDIA GPUs, runtime and math +libraries, and documentation including programming guides, +user manuals, and API references. + + +Default Install Location of CUDA Toolkit + +Windows platform: + +%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.# + +Linux platform: + +/usr/local/cuda-#.# + +Mac platform: + +/Developer/NVIDIA/CUDA-#.# + + +NVIDIA CUDA Samples + + +Description + +This package includes over 100+ CUDA examples that demonstrate +various CUDA programming principles, and efficient CUDA +implementation of algorithms in specific application domains. + + +Default Install Location of CUDA Samples + +Windows platform: + +%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.# + +Linux platform: + +/usr/local/cuda-#.#/samples + +and + +$HOME/NVIDIA_CUDA-#.#_Samples + +Mac platform: + +/Developer/NVIDIA/CUDA-#.#/samples + + +NVIDIA Nsight Visual Studio Edition (Windows only) + + +Description + +NVIDIA Nsight Development Platform, Visual Studio Edition is a +development environment integrated into Microsoft Visual +Studio that provides tools for debugging, profiling, analyzing +and optimizing your GPU computing and graphics applications. + + +Default Install Location of Nsight Visual Studio Edition + +Windows platform: + +%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.# + + +1. 
License Agreement for NVIDIA Software Development Kits +--------------------------------------------------------- + + +Release Date: July 26, 2018 +--------------------------- + + +Important Notice—Read before downloading, installing, +copying or using the licensed software: +------------------------------------------------------- + +This license agreement, including exhibits attached +(“Agreement”) is a legal agreement between you and NVIDIA +Corporation ("NVIDIA") and governs your use of a NVIDIA +software development kit (“SDK”). + +Each SDK has its own set of software and materials, but here +is a description of the types of items that may be included in +a SDK: source code, header files, APIs, data sets and assets +(examples include images, textures, models, scenes, videos, +native API input/output files), binary software, sample code, +libraries, utility programs, programming code and +documentation. + +This Agreement can be accepted only by an adult of legal age +of majority in the country in which the SDK is used. + +If you are entering into this Agreement on behalf of a company +or other legal entity, you represent that you have the legal +authority to bind the entity to this Agreement, in which case +“you” will mean the entity you represent. + +If you don’t have the required age or authority to accept +this Agreement, or if you don’t accept all the terms and +conditions of this Agreement, do not download, install or use +the SDK. + +You agree to use the SDK only for purposes that are permitted +by (a) this Agreement, and (b) any applicable law, regulation +or generally accepted practices or guidelines in the relevant +jurisdictions. + + +1.1. License + + +1.1.1. License Grant + +Subject to the terms of this Agreement, NVIDIA hereby grants +you a non-exclusive, non-transferable license, without the +right to sublicense (except as expressly provided in this +Agreement) to: + + 1. Install and use the SDK, + + 2.
Modify and create derivative works of sample source code + delivered in the SDK, and + + 3. Distribute those portions of the SDK that are identified + in this Agreement as distributable, as incorporated in + object code format into a software application that meets + the distribution requirements indicated in this Agreement. + + +1.1.2. Distribution Requirements + +These are the distribution requirements for you to exercise +the distribution grant: + + 1. Your application must have material additional + functionality, beyond the included portions of the SDK. + + 2. The distributable portions of the SDK shall only be + accessed by your application. + + 3. The following notice shall be included in modifications + and derivative works of sample source code distributed: + “This software contains source code provided by NVIDIA + Corporation.” + + 4. Unless a developer tool is identified in this Agreement + as distributable, it is delivered for your internal use + only. + + 5. The terms under which you distribute your application + must be consistent with the terms of this Agreement, + including (without limitation) terms relating to the + license grant and license restrictions and protection of + NVIDIA’s intellectual property rights. Additionally, you + agree that you will protect the privacy, security and + legal rights of your application users. + + 6. You agree to notify NVIDIA in writing of any known or + suspected distribution or use of the SDK not in compliance + with the requirements of this Agreement, and to enforce + the terms of your agreements with respect to distributed + SDK. + + +1.1.3. Authorized Users + +You may allow employees and contractors of your entity or of +your subsidiary(ies) to access and use the SDK from your +secure network to perform work on your behalf. + +If you are an academic institution you may allow users +enrolled or employed by the academic institution to access and +use the SDK from your secure network. 
+ +You are responsible for the compliance with the terms of this +Agreement by your authorized users. If you become aware that +your authorized users didn’t follow the terms of this +Agreement, you agree to take reasonable steps to resolve the +non-compliance and prevent new occurrences. + + +1.1.4. Pre-Release SDK + +The SDK versions identified as alpha, beta, preview or +otherwise as pre-release, may not be fully functional, may +contain errors or design flaws, and may have reduced or +different security, privacy, accessibility, availability, and +reliability standards relative to commercial versions of +NVIDIA software and materials. Use of a pre-release SDK may +result in unexpected results, loss of data, project delays or +other unpredictable damage or loss. + +You may use a pre-release SDK at your own risk, understanding +that pre-release SDKs are not intended for use in production +or business-critical systems. + +NVIDIA may choose not to make available a commercial version +of any pre-release SDK. NVIDIA may also choose to abandon +development and terminate the availability of a pre-release +SDK at any time without liability. + + +1.1.5. Updates + +NVIDIA may, at its option, make available patches, workarounds +or other updates to this SDK. Unless the updates are provided +with their separate governing terms, they are deemed part of +the SDK licensed to you as provided in this Agreement. You +agree that the form and content of the SDK that NVIDIA +provides may change without prior notice to you. While NVIDIA +generally maintains compatibility between versions, NVIDIA may +in some cases make changes that introduce incompatibilities in +future versions of the SDK. + + +1.1.6. Third Party Licenses + +The SDK may come bundled with, or otherwise include or be +distributed with, third party software licensed by a NVIDIA +supplier and/or open source software provided under an open +source license. 
Use of third party software is subject to the +third-party license terms, or in the absence of third party +terms, the terms of this Agreement. Copyright to third party +software is held by the copyright holders indicated in the +third-party software or license. + + +1.1.7. Reservation of Rights + +NVIDIA reserves all rights, title, and interest in and to the +SDK, not expressly granted to you under this Agreement. + + +1.2. Limitations + +The following license limitations apply to your use of the +SDK: + + 1. You may not reverse engineer, decompile or disassemble, + or remove copyright or other proprietary notices from any + portion of the SDK or copies of the SDK. + + 2. Except as expressly provided in this Agreement, you may + not copy, sell, rent, sublicense, transfer, distribute, + modify, or create derivative works of any portion of the + SDK. For clarity, you may not distribute or sublicense the + SDK as a stand-alone product. + + 3. Unless you have an agreement with NVIDIA for this + purpose, you may not indicate that an application created + with the SDK is sponsored or endorsed by NVIDIA. + + 4. You may not bypass, disable, or circumvent any + encryption, security, digital rights management or + authentication mechanism in the SDK. + + 5. You may not use the SDK in any manner that would cause it + to become subject to an open source software license. As + examples, licenses that require as a condition of use, + modification, and/or distribution that the SDK be: + + a. Disclosed or distributed in source code form; + + b. Licensed for the purpose of making derivative works; + or + + c. Redistributable at no charge. + + 6. Unless you have an agreement with NVIDIA for this + purpose, you may not use the SDK with any system or + application where the use or failure of the system or + application can reasonably be expected to threaten or + result in personal injury, death, or catastrophic loss. 
+ Examples include use in avionics, navigation, military, + medical, life support or other life critical applications. + NVIDIA does not design, test or manufacture the SDK for + these critical uses and NVIDIA shall not be liable to you + or any third party, in whole or in part, for any claims or + damages arising from such uses. + + 7. You agree to defend, indemnify and hold harmless NVIDIA + and its affiliates, and their respective employees, + contractors, agents, officers and directors, from and + against any and all claims, damages, obligations, losses, + liabilities, costs or debt, fines, restitutions and + expenses (including but not limited to attorney’s fees + and costs incident to establishing the right of + indemnification) arising out of or related to your use of + the SDK outside of the scope of this Agreement, or not in + compliance with its terms. + + +1.3. Ownership + + 1. NVIDIA or its licensors hold all rights, title and + interest in and to the SDK and its modifications and + derivative works, including their respective intellectual + property rights, subject to your rights described in this + section. This SDK may include software and materials from + NVIDIA’s licensors, and these licensors are intended + third party beneficiaries that may enforce this Agreement + with respect to their intellectual property rights. + + 2. You hold all rights, title and interest in and to your + applications and your derivative works of the sample + source code delivered in the SDK, including their + respective intellectual property rights, subject to + NVIDIA’s rights described in this section. + + 3. You may, but don’t have to, provide to NVIDIA + suggestions, feature requests or other feedback regarding + the SDK, including possible enhancements or modifications + to the SDK. 
For any feedback that you voluntarily provide, + you hereby grant NVIDIA and its affiliates a perpetual, + non-exclusive, worldwide, irrevocable license to use, + reproduce, modify, license, sublicense (through multiple + tiers of sublicensees), and distribute (through multiple + tiers of distributors) it without the payment of any + royalties or fees to you. NVIDIA will use feedback at its + choice. NVIDIA is constantly looking for ways to improve + its products, so you may send feedback to NVIDIA through + the developer portal at https://developer.nvidia.com. + + +1.4. No Warranties + +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL +FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND +ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND +OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, +BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE +ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO +WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF +DEALING OR COURSE OF TRADE. + + +1.5. Limitation of Liability + +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS +AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, +PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS +OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF +PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION +WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, +WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH +OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), +PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF +LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES +TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS +AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE +NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS +LIMIT. 
+ +These exclusions and limitations of liability shall apply +regardless if NVIDIA or its affiliates have been advised of +the possibility of such damages, and regardless of whether a +remedy fails its essential purpose. These exclusions and +limitations of liability form an essential basis of the +bargain between the parties, and, absent any of these +exclusions or limitations of liability, the provisions of this +Agreement, including, without limitation, the economic terms, +would be substantially different. + + +1.6. Termination + + 1. This Agreement will continue to apply until terminated by + either you or NVIDIA as described below. + + 2. If you want to terminate this Agreement, you may do so by + stopping to use the SDK. + + 3. NVIDIA may, at any time, terminate this Agreement if: + + a. (i) you fail to comply with any term of this + Agreement and the non-compliance is not fixed within + thirty (30) days following notice from NVIDIA (or + immediately if you violate NVIDIA’s intellectual + property rights); + + b. (ii) you commence or participate in any legal + proceeding against NVIDIA with respect to the SDK; or + + c. (iii) NVIDIA decides to no longer provide the SDK in + a country or, in NVIDIA’s sole discretion, the + continued use of it is no longer commercially viable. + + 4. Upon any termination of this Agreement, you agree to + promptly discontinue use of the SDK and destroy all copies + in your possession or control. Your prior distributions in + accordance with this Agreement are not affected by the + termination of this Agreement. Upon written request, you + will certify in writing that you have complied with your + commitments under this section. Upon any termination of + this Agreement all provisions survive except for the + license grant provisions. + + +1.7. General + +If you wish to assign this Agreement or your rights and +obligations, including by merger, consolidation, dissolution +or operation of law, contact NVIDIA to ask for permission. 
Any +attempted assignment not approved by NVIDIA in writing shall +be void and of no effect. NVIDIA may assign, delegate or +transfer this Agreement and its rights and obligations, and if +to a non-affiliate you will be notified. + +You agree to cooperate with NVIDIA and provide reasonably +requested information to verify your compliance with this +Agreement. + +This Agreement will be governed in all respects by the laws of +the United States and of the State of Delaware as those laws +are applied to contracts entered into and performed entirely +within Delaware by Delaware residents, without regard to the +conflicts of laws principles. The United Nations Convention on +Contracts for the International Sale of Goods is specifically +disclaimed. You agree to all terms of this Agreement in the +English language. + +The state or federal courts residing in Santa Clara County, +California shall have exclusive jurisdiction over any dispute +or claim arising out of this Agreement. Notwithstanding this, +you agree that NVIDIA shall still be allowed to apply for +injunctive remedies or an equivalent type of urgent legal +relief in any jurisdiction. + +If any court of competent jurisdiction determines that any +provision of this Agreement is illegal, invalid or +unenforceable, such provision will be construed as limited to +the extent necessary to be consistent with and fully +enforceable under the law and the remaining provisions will +remain in full force and effect. Unless otherwise specified, +remedies are cumulative. + +Each party acknowledges and agrees that the other is an +independent contractor in the performance of this Agreement. + +The SDK has been developed entirely at private expense and is +“commercial items” consisting of “commercial computer +software” and “commercial computer software +documentation” provided with RESTRICTED RIGHTS. Use, +duplication or disclosure by the U.S. Government or a U.S. 
+Government subcontractor is subject to the restrictions in +this Agreement pursuant to DFARS 227.7202-3(a) or as set forth +in subparagraphs (c)(1) and (2) of the Commercial Computer +Software - Restricted Rights clause at FAR 52.227-19, as +applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas +Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and +regulations. You agree that you will not ship, transfer or +export the SDK into any country, or use the SDK in any manner, +prohibited by the United States Bureau of Industry and +Security or economic sanctions regulations administered by the +U.S. Department of Treasury’s Office of Foreign Assets +Control (OFAC), or any applicable export laws, restrictions or +regulations. These laws include restrictions on destinations, +end users and end use. By accepting this Agreement, you +confirm that you are not a resident or citizen of any country +currently embargoed by the U.S. and that you are not otherwise +prohibited from receiving the SDK. + +Any notice delivered by NVIDIA to you under this Agreement +will be delivered via mail, email or fax. You agree that any +notices that NVIDIA sends you electronically will satisfy any +legal communication requirements. Please direct your legal +notices or other correspondence to NVIDIA Corporation, 2788 +San Tomas Expressway, Santa Clara, California 95051, United +States of America, Attention: Legal Department. + +This Agreement and any exhibits incorporated into this +Agreement constitute the entire agreement of the parties with +respect to the subject matter of this Agreement and supersede +all prior negotiations or documentation exchanged between the +parties relating to this SDK license. Any additional and/or +conflicting terms on documents issued by you are null, void, +and invalid. Any amendment or waiver under this Agreement +shall be in writing and signed by representatives of both +parties. + + +2. 
CUDA Toolkit Supplement to Software License Agreement for +NVIDIA Software Development Kits +------------------------------------------------------------ + + +Release date: August 16, 2018 +----------------------------- + +The terms in this supplement govern your use of the NVIDIA +CUDA Toolkit SDK under the terms of your license agreement +(“Agreement”) as modified by this supplement. Capitalized +terms used but not defined below have the meaning assigned to +them in the Agreement. + +This supplement is an exhibit to the Agreement and is +incorporated as an integral part of the Agreement. In the +event of conflict between the terms in this supplement and the +terms in the Agreement, the terms in this supplement govern. + + +2.1. License Scope + +The SDK is licensed for you to develop applications only for +use in systems with NVIDIA GPUs. + + +2.2. Distribution + +The portions of the SDK that are distributable under the +Agreement are listed in Attachment A. + + +2.3. Operating Systems + +Those portions of the SDK designed exclusively for use on the +Linux or FreeBSD operating systems, or other operating systems +derived from the source code to these operating systems, may +be copied and redistributed for use in accordance with this +Agreement, provided that the object code files are not +modified in any way (except for unzipping of compressed +files). + + +2.4. Audio and Video Encoders and Decoders + +You acknowledge and agree that it is your sole responsibility +to obtain any additional third-party licenses required to +make, have made, use, have used, sell, import, and offer for +sale your products or services that include or incorporate any +third-party software and content relating to audio and/or +video encoders and decoders from, including but not limited +to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., +MPEG-LA, and Coding Technologies. 
NVIDIA does not grant to you +under this Agreement any necessary patent or other rights with +respect to any audio and/or video encoders and decoders. + + +2.5. Licensing + +If the distribution terms in this Agreement are not suitable +for your organization, or for any questions regarding this +Agreement, please contact NVIDIA at +nvidia-compute-license-questions@nvidia.com. + + +2.6. Attachment A + +The following portions of the SDK are distributable under the +Agreement: + +Component + +CUDA Runtime + +Windows + +cudart.dll, cudart_static.lib, cudadevrt.lib + +Mac OSX + +libcudart.dylib, libcudart_static.a, libcudadevrt.a + +Linux + +libcudart.so, libcudart_static.a, libcudadevrt.a + +Android + +libcudart.so, libcudart_static.a, libcudadevrt.a + +Component + +CUDA FFT Library + +Windows + +cufft.dll, cufftw.dll, cufft.lib, cufftw.lib + +Mac OSX + +libcufft.dylib, libcufft_static.a, libcufftw.dylib, +libcufftw_static.a + +Linux + +libcufft.so, libcufft_static.a, libcufftw.so, +libcufftw_static.a + +Android + +libcufft.so, libcufft_static.a, libcufftw.so, +libcufftw_static.a + +Component + +CUDA BLAS Library + +Windows + +cublas.dll, cublasLt.dll + +Mac OSX + +libcublas.dylib, libcublasLt.dylib, libcublas_static.a, +libcublasLt_static.a + +Linux + +libcublas.so, libcublasLt.so, libcublas_static.a, +libcublasLt_static.a + +Android + +libcublas.so, libcublasLt.so, libcublas_static.a, +libcublasLt_static.a + +Component + +NVIDIA "Drop-in" BLAS Library + +Windows + +nvblas.dll + +Mac OSX + +libnvblas.dylib + +Linux + +libnvblas.so + +Component + +CUDA Sparse Matrix Library + +Windows + +cusparse.dll, cusparse.lib + +Mac OSX + +libcusparse.dylib, libcusparse_static.a + +Linux + +libcusparse.so, libcusparse_static.a + +Android + +libcusparse.so, libcusparse_static.a + +Component + +CUDA Linear Solver Library + +Windows + +cusolver.dll, cusolver.lib + +Mac OSX + +libcusolver.dylib, libcusolver_static.a + +Linux + +libcusolver.so, libcusolver_static.a + +Android + 
+libcusolver.so, libcusolver_static.a + +Component + +CUDA Random Number Generation Library + +Windows + +curand.dll, curand.lib + +Mac OSX + +libcurand.dylib, libcurand_static.a + +Linux + +libcurand.so, libcurand_static.a + +Android + +libcurand.so, libcurand_static.a + +Component + +CUDA Accelerated Graph Library + +Component + +NVIDIA Performance Primitives Library + +Windows + +nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll, +nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll, +nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib, +nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll, +nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib + +Mac OSX + +libnppc.dylib, libnppc_static.a, libnppial.dylib, +libnppial_static.a, libnppicc.dylib, libnppicc_static.a, +libnppicom.dylib, libnppicom_static.a, libnppidei.dylib, +libnppidei_static.a, libnppif.dylib, libnppif_static.a, +libnppig.dylib, libnppig_static.a, libnppim.dylib, +libnppisu_static.a, libnppitc.dylib, libnppitc_static.a, +libnpps.dylib, libnpps_static.a + +Linux + +libnppc.so, libnppc_static.a, libnppial.so, +libnppial_static.a, libnppicc.so, libnppicc_static.a, +libnppicom.so, libnppicom_static.a, libnppidei.so, +libnppidei_static.a, libnppif.so, libnppif_static.a +libnppig.so, libnppig_static.a, libnppim.so, +libnppim_static.a, libnppist.so, libnppist_static.a, +libnppisu.so, libnppisu_static.a, libnppitc.so +libnppitc_static.a, libnpps.so, libnpps_static.a + +Android + +libnppc.so, libnppc_static.a, libnppial.so, +libnppial_static.a, libnppicc.so, libnppicc_static.a, +libnppicom.so, libnppicom_static.a, libnppidei.so, +libnppidei_static.a, libnppif.so, libnppif_static.a +libnppig.so, libnppig_static.a, libnppim.so, +libnppim_static.a, libnppist.so, libnppist_static.a, +libnppisu.so, libnppisu_static.a, libnppitc.so +libnppitc_static.a, libnpps.so, libnpps_static.a + +Component + +NVIDIA JPEG Library + +Linux + +libnvjpeg.so, libnvjpeg_static.a + +Component + +Internal common library 
required for statically linking to +cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP + +Mac OSX + +libculibos.a + +Linux + +libculibos.a + +Component + +NVIDIA Runtime Compilation Library and Header + +All + +nvrtc.h + +Windows + +nvrtc.dll, nvrtc-builtins.dll + +Mac OSX + +libnvrtc.dylib, libnvrtc-builtins.dylib + +Linux + +libnvrtc.so, libnvrtc-builtins.so + +Component + +NVIDIA Optimizing Compiler Library + +Windows + +nvvm.dll + +Mac OSX + +libnvvm.dylib + +Linux + +libnvvm.so + +Component + +NVIDIA Common Device Math Functions Library + +Windows + +libdevice.10.bc + +Mac OSX + +libdevice.10.bc + +Linux + +libdevice.10.bc + +Component + +CUDA Occupancy Calculation Header Library + +All + +cuda_occupancy.h + +Component + +CUDA Half Precision Headers + +All + +cuda_fp16.h, cuda_fp16.hpp + +Component + +CUDA Profiling Tools Interface (CUPTI) Library + +Windows + +cupti.dll + +Mac OSX + +libcupti.dylib + +Linux + +libcupti.so + +Component + +NVIDIA Tools Extension Library + +Windows + +nvToolsExt.dll, nvToolsExt.lib + +Mac OSX + +libnvToolsExt.dylib + +Linux + +libnvToolsExt.so + +Component + +NVIDIA CUDA Driver Libraries + +Linux + +libcuda.so, libnvidia-fatbinaryloader.so, +libnvidia-ptxjitcompiler.so + +The NVIDIA CUDA Driver Libraries are only distributable in +applications that meet this criteria: + + 1. The application was developed starting from a NVIDIA CUDA + container obtained from Docker Hub or the NVIDIA GPU + Cloud, and + + 2. The resulting application is packaged as a Docker + container and distributed to users on Docker Hub or the + NVIDIA GPU Cloud only. + + +2.7. Attachment B + + +Additional Licensing Obligations + +The following third party components included in the SOFTWARE +are licensed to Licensee pursuant to the following terms and +conditions: + + 1. 
Licensee's use of the GDB third party component is + subject to the terms and conditions of GNU GPL v3: + + This product includes copyrighted third-party software licensed + under the terms of the GNU General Public License v3 ("GPL v3"). + All third-party software packages are copyright by their respective + authors. GPL v3 terms and conditions are hereby incorporated into + the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt + + Consistent with these licensing requirements, the software + listed below is provided under the terms of the specified + open source software licenses. To obtain source code for + software provided under licenses that require + redistribution of source code, including the GNU General + Public License (GPL) and GNU Lesser General Public License + (LGPL), contact oss-requests@nvidia.com. This offer is + valid for a period of three (3) years from the date of the + distribution of this product by NVIDIA CORPORATION. + + Component License + CUDA-GDB GPL v3 + + 2. Licensee represents and warrants that any and all third + party licensing and/or royalty payment obligations in + connection with Licensee's use of the H.264 video codecs + are solely the responsibility of Licensee. + + 3. Licensee's use of the Thrust library is subject to the + terms and conditions of the Apache License Version 2.0. + All third-party software packages are copyright by their + respective authors. Apache License Version 2.0 terms and + conditions are hereby incorporated into the Agreement by + this reference. + http://www.apache.org/licenses/LICENSE-2.0.html + + In addition, Licensee acknowledges the following notice: + Thrust includes source code from the Boost Iterator, + Tuple, System, and Random Number libraries. + + Boost Software License - Version 1.0 - August 17th, 2003 + . . . . 
+ + Permission is hereby granted, free of charge, to any person or + organization obtaining a copy of the software and accompanying + documentation covered by this license (the "Software") to use, + reproduce, display, distribute, execute, and transmit the Software, + and to prepare derivative works of the Software, and to permit + third-parties to whom the Software is furnished to do so, all + subject to the following: + + The copyright notices in the Software and this entire statement, + including the above license grant, this restriction and the following + disclaimer, must be included in all copies of the Software, in whole + or in part, and all derivative works of the Software, unless such + copies or derivative works are solely in the form of machine-executable + object code generated by a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND + NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR + ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR + OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + 4. Licensee's use of the LLVM third party component is + subject to the following terms and conditions: + + ====================================================== + LLVM Release License + ====================================================== + University of Illinois/NCSA + Open Source License + + Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. + All rights reserved. 
+ + Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal with the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at Urbana- + Champaign, nor the names of its contributors may be used to endorse or + promote products derived from this Software without specific prior + written permission. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS WITH THE SOFTWARE. + + 5. Licensee's use (e.g. nvprof) of the PCRE third party + component is subject to the following terms and + conditions: + + ------------ + PCRE LICENCE + ------------ + PCRE is a library of functions to support regular expressions whose syntax + and semantics are as close as possible to those of the Perl 5 language. 
+ Release 8 of PCRE is distributed under the terms of the "BSD" licence, as + specified below. The documentation for PCRE, supplied in the "doc" + directory, is distributed under the same terms as the software itself. The + basic library functions are written in C and are freestanding. Also + included in the distribution is a set of C++ wrapper functions, and a just- + in-time compiler that can be used to optimize pattern matching. These are + both optional features that can be omitted when the library is built. + + THE BASIC LIBRARY FUNCTIONS + --------------------------- + Written by: Philip Hazel + Email local part: ph10 + Email domain: cam.ac.uk + University of Cambridge Computing Service, + Cambridge, England. + Copyright (c) 1997-2012 University of Cambridge + All rights reserved. + + PCRE JUST-IN-TIME COMPILATION SUPPORT + ------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Emain domain: freemail.hu + Copyright(c) 2010-2012 Zoltan Herczeg + All rights reserved. + + STACK-LESS JUST-IN-TIME COMPILER + -------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Emain domain: freemail.hu + Copyright(c) 2009-2012 Zoltan Herczeg + All rights reserved. + + THE C++ WRAPPER FUNCTIONS + ------------------------- + Contributed by: Google Inc. + Copyright (c) 2007-2012, Google Inc. + All rights reserved. + + THE "BSD" LICENCE + ----------------- + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the name of Google + Inc. nor the names of their contributors may be used to endorse or + promote products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 6. Some of the cuBLAS library routines were written by or + derived from code written by Vasily Volkov and are subject + to the Modified Berkeley Software Distribution License as + follows: + + Copyright (c) 2007-2009, Regents of the University of California + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ * Neither the name of the University of California, Berkeley nor + the names of its contributors may be used to endorse or promote + products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 7. Some of the cuBLAS library routines were written by or + derived from code written by Davide Barbieri and are + subject to the Modified Berkeley Software Distribution + License as follows: + + Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * The name of the author may not be used to endorse or promote + products derived from this software without specific prior + written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 8. Some of the cuBLAS library routines were derived from + code developed by the University of Tennessee and are + subject to the Modified Berkeley Software Distribution + License as follows: + + Copyright (c) 2010 The University of Tennessee. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer listed in this license in the documentation and/or + other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 9. Some of the cuBLAS library routines were written by or + derived from code written by Jonathan Hogg and are subject + to the Modified Berkeley Software Distribution License as + follows: + + Copyright (c) 2012, The Science and Technology Facilities Council (STFC). + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the STFC nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE STFC BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 10. Some of the cuBLAS library routines were written by or + derived from code written by Ahmad M. Abdelfattah, David + Keyes, and Hatem Ltaief, and are subject to the Apache + License, Version 2.0, as follows: + + -- (C) Copyright 2013 King Abdullah University of Science and Technology + Authors: + Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa) + David Keyes (david.keyes@kaust.edu.sa) + Hatem Ltaief (hatem.ltaief@kaust.edu.sa) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the King Abdullah University of Science and + Technology nor the names of its contributors may be used to endorse + or promote products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE + + 11. Some of the cuSPARSE library routines were written by or + derived from code written by Li-Wen Chang and are subject + to the NCSA Open Source License as follows: + + Copyright (c) 2012, University of Illinois. + + All rights reserved. + + Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal with the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimers in the documentation and/or other materials provided + with the distribution. + * Neither the names of IMPACT Group, University of Illinois, nor + the names of its contributors may be used to endorse or promote + products derived from this Software without specific prior + written permission. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + 12. Some of the cuRAND library routines were written by or + derived from code written by Mutsuo Saito and Makoto + Matsumoto and are subject to the following license: + + Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima + University. All rights reserved. + + Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima + University and University of Tokyo. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the Hiroshima University nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 13. Some of the cuRAND library routines were derived from + code developed by D. E. Shaw Research and are subject to + the following license: + + Copyright 2010-2011, D. E. Shaw Research. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 14. Some of the Math library routines were written by or + derived from code developed by Norbert Juffa and are + subject to the following license: + + Copyright (c) 2015-2017, Norbert Juffa + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 15. Licensee's use of the lz4 third party component is + subject to the following terms and conditions: + + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 16. The NPP library uses code from the Boost Math Toolkit, + and is subject to the following license: + + Boost Software License - Version 1.0 - August 17th, 2003 + . . . . + + Permission is hereby granted, free of charge, to any person or + organization obtaining a copy of the software and accompanying + documentation covered by this license (the "Software") to use, + reproduce, display, distribute, execute, and transmit the Software, + and to prepare derivative works of the Software, and to permit + third-parties to whom the Software is furnished to do so, all + subject to the following: + + The copyright notices in the Software and this entire statement, + including the above license grant, this restriction and the following + disclaimer, must be included in all copies of the Software, in whole + or in part, and all derivative works of the Software, unless such + copies or derivative works are solely in the form of machine-executable + object code generated by a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND + NON-INFRINGEMENT. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR + ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR + OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + 17. Portions of the Nsight Eclipse Edition is subject to the + following license: + + The Eclipse Foundation makes available all content in this plug-in + ("Content"). Unless otherwise indicated below, the Content is provided + to you under the terms and conditions of the Eclipse Public License + Version 1.0 ("EPL"). A copy of the EPL is available at http:// + www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program" + will mean the Content. + + If you did not receive this Content directly from the Eclipse + Foundation, the Content is being redistributed by another party + ("Redistributor") and different terms and conditions may apply to your + use of any object code in the Content. Check the Redistributor's + license that was provided with the Content. If no such license exists, + contact the Redistributor. Unless otherwise indicated below, the terms + and conditions of the EPL still apply to any source code in the + Content and such source code may be obtained at http://www.eclipse.org. + + 18. Some of the cuBLAS library routines uses code from + OpenAI, which is subject to the following license: + + License URL + https://github.com/openai/openai-gemm/blob/master/LICENSE + + License Text + The MIT License + + Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + 19. Licensee's use of the Visual Studio Setup Configuration + Samples is subject to the following license: + + The MIT License (MIT) + Copyright (C) Microsoft Corporation. All rights reserved. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + 20. Licensee's use of linmath.h header for CPU functions for + GL vector/matrix operations from lunarG is subject to the + Apache License Version 2.0. + + 21. The DX12-CUDA sample uses the d3dx12.h header, which is + subject to the MIT license . + +----------------- +``` + +### URLs + - `Homepage`: https://developer.nvidia.com/cuda-zone + + +## nvidia-cuda-runtime (13.0.48) + +### Licenses +License: `LicenseRef-NVIDIA-Proprietary` + + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -17134,67 +25123,67 @@ License: `Apache-2.0` -## nvidia-cudnn-cu12 (9.10.2.21) +## nvidia-cudnn-cu13 (9.13.0.50) ### Licenses -License: `LicenseRef-NVIDIA-Proprietary` +License: `None` - `licenses/License.txt`: ``` LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS -This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”). +This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”). 
-Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. +Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. -This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. +This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. -If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. +If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. -If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. 
You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. -1. License. +1. License. 1.1 Grant -Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: +Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: (i) Install and use the SDK, (ii) Modify and create derivative works of sample source code delivered in the SDK, and - + (iii) Distribute those portions of the SDK that are identified in this Agreement as distributable, as incorporated in object code format into a software application that meets the distribution requirements indicated in this Agreement. 1.2 Distribution Requirements These are the distribution requirements for you to exercise the distribution grant: - + (i) Your application must have material additional functionality, beyond the included portions of the SDK. -(ii) The distributable portions of the SDK shall only be accessed by your application. +(ii) The distributable portions of the SDK shall only be accessed by your application. (iii) The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” (iv) Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. -(v) The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. 
Additionally, you agree that you will protect the privacy, security and legal rights of your application users. +(v) The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. (vi) You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK. 1.3 Authorized Users -You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. +You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. -If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. +If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. -You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. +You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. 
-1.4 Pre-Release SDK -The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. -You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. -NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. +1.4 Pre-Release SDK +The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. +You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. +NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. 1.5 Updates NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement. 
@@ -17209,65 +25198,65 @@ The SDK may come bundled with, or otherwise include or be distributed with, thir NVIDIA reserves all rights, title and interest in and to the SDK not expressly granted to you under this Agreement. -2. Limitations. +2. Limitations. The following license limitations apply to your use of the SDK: -2.1 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. +2.1 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. -2.2 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. +2.2 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. -2.3 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. +2.3 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. -2.4 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. +2.4 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. 2.5 You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. 
-2.6 Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +2.6 Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. 2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to your use of the SDK outside of the scope of this Agreement, or not in compliance with its terms. -3. Ownership. +3. Ownership. -3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights under Section 3.2. 
This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights. +3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights under Section 3.2. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights. 3.2 You hold all rights, title and interest in and to your applications and your derivative works of the sample source code delivered in the SDK, including their respective intellectual property rights, subject to NVIDIA’s rights under section 3.1. 3.3 You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. NVIDIA is constantly looking for ways to improve its products, so you may send feedback to NVIDIA through the developer portal at https://developer.nvidia.com. -4. No Warranties. +4. No Warranties. 
-THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. -5. Limitations of Liability. +5. Limitations of Liability. -TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. 
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different. +These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different. -6. Termination. +6. Termination. -6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below. 
+6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below. -6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK. +6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK. -6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. +6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. -6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the licenses granted to you. +6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. 
Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the licenses granted to you. -7. General. - -If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. +7. General. + +If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement. This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. -The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. 
Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. -Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement. +Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement. The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. @@ -17284,11 +25273,11 @@ cuDNN SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT K The terms in this supplement govern your use of the NVIDIA cuDNN SDK under the terms of your license agreement (“Agreement”) as modified by this supplement. 
Capitalized terms used but not defined below have the meaning assigned to them in the Agreement. -This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern. +This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern. 4.1 License Scope. The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -2. Distribution. The following portions of the SDK are distributable under the Agreement: the runtime files .so and .h, cudnn64_7.dll, and cudnn.lib. +2. Distribution. The following portions of the SDK are distributable under the Agreement: the runtime files .so and .h, cudnn64_7.dll, and cudnn.lib. In addition to the rights above, for parties that are developing software intended solely for use on Jetson development kits or Jetson modules and running Linux for Tegra software the following shall apply: the SDK may be distributed in its entirety, as provided by NVIDIA and without separation of its components, for you and/or your licensees to create software development kits for use only on the Jetson platform and running Linux for Tegra software. @@ -17301,7 +25290,7 @@ In addition to the rights above, for parties that are developing software intend - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cudnn-frontend (1.15.0) +## nvidia-cudnn-frontend (1.16.0) ### Licenses License: `NVIDIA Proprietary Software` @@ -17328,7 +25317,7 @@ License: `NVIDIA Proprietary Software` * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
- */ + */ ``` ### URLs @@ -17336,12 +25325,12 @@ License: `NVIDIA Proprietary Software` - `Homepage`: https://github.com/nvidia/cudnn-frontend -## nvidia-cufft-cu12 (11.3.3.83) +## nvidia-cufft (12.0.0.15) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -18917,12 +26906,12 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cufile-cu12 (1.13.1.3) +## nvidia-cufile (1.15.0.42) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -20498,12 +28487,12 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-curand-cu12 (10.3.9.90) +## nvidia-curand (10.4.0.35) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -22079,12 +30068,12 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cusolver-cu12 (11.7.3.90) +## nvidia-cusolver (12.0.3.29) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -23660,12 +31649,12 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cusparse-cu12 (12.5.8.93) +## nvidia-cusparse (12.6.2.49) ### Licenses -License: `NVIDIA Proprietary Software` +License: `LicenseRef-NVIDIA-Proprietary` - - `License.txt`: + - `licenses/License.txt`: ``` End User License Agreement -------------------------- @@ -25241,7 +33230,7 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-cusparselt-cu12 (0.7.1) +## nvidia-cusparselt-cu13 (0.8.0) ### 
Licenses License: `NVIDIA Proprietary Software` @@ -25250,12 +33239,12 @@ License: `NVIDIA Proprietary Software` - `Homepage`: https://developer.nvidia.com/cusparselt -## nvidia-cutlass-dsl (4.3.1) +## nvidia-cutlass-dsl (4.2.1) ### Licenses License: `None` - - `licenses/LICENSE`: + - `LICENSE`: ``` NVIDIA Software License Agreement @@ -25462,14 +33451,14 @@ License: `BSD` - `Homepage`: https://forums.developer.nvidia.com -## nvidia-modelopt (0.33.1) +## nvidia-modelopt (0.37.0) ### Licenses License: `Apache 2.0` - - `licenses/LICENSE`: + - `licenses/LICENSE_HEADER`: ``` -SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 Licensed under the Apache License, Version 2.0 (the "License"); @@ -25489,6 +33478,7 @@ limitations under the License. - `Homepage`: https://github.com/NVIDIA/Model-Optimizer +<<<<<<< HEAD ## nvidia-modelopt-core (0.33.1) ### Licenses @@ -25517,6 +33507,9 @@ limitations under the License. ## nvidia-nccl-cu12 (2.27.3) +======= +## nvidia-nccl-cu13 (2.27.7) +>>>>>>> fd7624b32 (modify ATTRIBUTIONS-Python.md) ### Licenses License: `BSD-3-Clause` @@ -25568,65 +33561,13 @@ for more information and license details. - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-nccl-cu13 (2.28.3) +## nvidia-nvjitlink (13.0.39) ### Licenses -License: `None` +License: `LicenseRef-NVIDIA-Proprietary` - `licenses/License.txt`: ``` - - Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National - Laboratory, the U.S. Department of Energy, nor the names of their - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - The U.S. Department of Energy funded the development of this software - under subcontract 7078610 with Lawrence Berkeley National Laboratory. - - -This code also includes files from the NVIDIA Tools Extension SDK project. - -See: - - https://github.com/NVIDIA/NVTX - -for more information and license details. 
-``` - -### URLs - - `Homepage`: https://developer.nvidia.com/cuda-zone - - -## nvidia-nvjitlink-cu12 (12.8.93) - -### Licenses -License: `NVIDIA Proprietary Software` - - - `License.txt`: -``` End User License Agreement -------------------------- @@ -27201,12 +35142,1593 @@ conditions: - `Homepage`: https://developer.nvidia.com/cuda-zone -## nvidia-nvtx-cu12 (12.8.90) +## nvidia-nvshmem-cu13 (3.3.24) + +### Licenses +License: `BSD-3-Clause` + + - `licenses/License.txt`: +``` +End User License Agreement +-------------------------- + + +Preface +------- + +The Software License Agreement in Chapter 1 and the Supplement +in Chapter 2 contain license terms and conditions that govern +the use of NVIDIA software. By accepting this agreement, you +agree to comply with all the terms and conditions applicable +to the product(s) included herein. + + +NVIDIA Driver + + +Description + +This package contains the operating system driver and +fundamental system software components for NVIDIA GPUs. + + +NVIDIA CUDA Toolkit + + +Description + +The NVIDIA CUDA Toolkit provides command-line and graphical +tools for building, debugging and optimizing the performance +of applications accelerated by NVIDIA GPUs, runtime and math +libraries, and documentation including programming guides, +user manuals, and API references. + + +Default Install Location of CUDA Toolkit + +Windows platform: + +%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.# + +Linux platform: + +/usr/local/cuda-#.# + +Mac platform: + +/Developer/NVIDIA/CUDA-#.# + + +NVIDIA CUDA Samples + + +Description + +This package includes over 100+ CUDA examples that demonstrate +various CUDA programming principles, and efficient CUDA +implementation of algorithms in specific application domains. 
+ + +Default Install Location of CUDA Samples + +Windows platform: + +%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.# + +Linux platform: + +/usr/local/cuda-#.#/samples + +and + +$HOME/NVIDIA_CUDA-#.#_Samples + +Mac platform: + +/Developer/NVIDIA/CUDA-#.#/samples + + +NVIDIA Nsight Visual Studio Edition (Windows only) + + +Description + +NVIDIA Nsight Development Platform, Visual Studio Edition is a +development environment integrated into Microsoft Visual +Studio that provides tools for debugging, profiling, analyzing +and optimizing your GPU computing and graphics applications. + + +Default Install Location of Nsight Visual Studio Edition + +Windows platform: + +%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.# + + +1. License Agreement for NVIDIA Software Development Kits +--------------------------------------------------------- + + +Release Date: July 26, 2018 +--------------------------- + + +Important NoticeRead before downloading, installing, +copying or using the licensed software: +------------------------------------------------------- + +This license agreement, including exhibits attached +("Agreement”) is a legal agreement between you and NVIDIA +Corporation ("NVIDIA") and governs your use of a NVIDIA +software development kit (“SDK”). + +Each SDK has its own set of software and materials, but here +is a description of the types of items that may be included in +a SDK: source code, header files, APIs, data sets and assets +(examples include images, textures, models, scenes, videos, +native API input/output files), binary software, sample code, +libraries, utility programs, programming code and +documentation. + +This Agreement can be accepted only by an adult of legal age +of majority in the country in which the SDK is used. 
+ +If you are entering into this Agreement on behalf of a company +or other legal entity, you represent that you have the legal +authority to bind the entity to this Agreement, in which case +“you” will mean the entity you represent. + +If you don’t have the required age or authority to accept +this Agreement, or if you don’t accept all the terms and +conditions of this Agreement, do not download, install or use +the SDK. + +You agree to use the SDK only for purposes that are permitted +by (a) this Agreement, and (b) any applicable law, regulation +or generally accepted practices or guidelines in the relevant +jurisdictions. + + +1.1. License + + +1.1.1. License Grant + +Subject to the terms of this Agreement, NVIDIA hereby grants +you a non-exclusive, non-transferable license, without the +right to sublicense (except as expressly provided in this +Agreement) to: + + 1. Install and use the SDK, + + 2. Modify and create derivative works of sample source code + delivered in the SDK, and + + 3. Distribute those portions of the SDK that are identified + in this Agreement as distributable, as incorporated in + object code format into a software application that meets + the distribution requirements indicated in this Agreement. + + +1.1.2. Distribution Requirements + +These are the distribution requirements for you to exercise +the distribution grant: + + 1. Your application must have material additional + functionality, beyond the included portions of the SDK. + + 2. The distributable portions of the SDK shall only be + accessed by your application. + + 3. The following notice shall be included in modifications + and derivative works of sample source code distributed: + “This software contains source code provided by NVIDIA + Corporation.” + + 4. Unless a developer tool is identified in this Agreement + as distributable, it is delivered for your internal use + only. + + 5. 
The terms under which you distribute your application + must be consistent with the terms of this Agreement, + including (without limitation) terms relating to the + license grant and license restrictions and protection of + NVIDIA’s intellectual property rights. Additionally, you + agree that you will protect the privacy, security and + legal rights of your application users. + + 6. You agree to notify NVIDIA in writing of any known or + suspected distribution or use of the SDK not in compliance + with the requirements of this Agreement, and to enforce + the terms of your agreements with respect to distributed + SDK. + + +1.1.3. Authorized Users + +You may allow employees and contractors of your entity or of +your subsidiary(ies) to access and use the SDK from your +secure network to perform work on your behalf. + +If you are an academic institution you may allow users +enrolled or employed by the academic institution to access and +use the SDK from your secure network. + +You are responsible for the compliance with the terms of this +Agreement by your authorized users. If you become aware that +your authorized users didn’t follow the terms of this +Agreement, you agree to take reasonable steps to resolve the +non-compliance and prevent new occurrences. + + +1.1.4. Pre-Release SDK + +The SDK versions identified as alpha, beta, preview or +otherwise as pre-release, may not be fully functional, may +contain errors or design flaws, and may have reduced or +different security, privacy, accessibility, availability, and +reliability standards relative to commercial versions of +NVIDIA software and materials. Use of a pre-release SDK may +result in unexpected results, loss of data, project delays or +other unpredictable damage or loss. + +You may use a pre-release SDK at your own risk, understanding +that pre-release SDKs are not intended for use in production +or business-critical systems. 
+ +NVIDIA may choose not to make available a commercial version +of any pre-release SDK. NVIDIA may also choose to abandon +development and terminate the availability of a pre-release +SDK at any time without liability. + + +1.1.5. Updates + +NVIDIA may, at its option, make available patches, workarounds +or other updates to this SDK. Unless the updates are provided +with their separate governing terms, they are deemed part of +the SDK licensed to you as provided in this Agreement. You +agree that the form and content of the SDK that NVIDIA +provides may change without prior notice to you. While NVIDIA +generally maintains compatibility between versions, NVIDIA may +in some cases make changes that introduce incompatibilities in +future versions of the SDK. + + +1.1.6. Third Party Licenses + +The SDK may come bundled with, or otherwise include or be +distributed with, third party software licensed by a NVIDIA +supplier and/or open source software provided under an open +source license. Use of third party software is subject to the +third-party license terms, or in the absence of third party +terms, the terms of this Agreement. Copyright to third party +software is held by the copyright holders indicated in the +third-party software or license. + + +1.1.7. Reservation of Rights + +NVIDIA reserves all rights, title, and interest in and to the +SDK, not expressly granted to you under this Agreement. + + +1.2. Limitations + +The following license limitations apply to your use of the +SDK: + + 1. You may not reverse engineer, decompile or disassemble, + or remove copyright or other proprietary notices from any + portion of the SDK or copies of the SDK. + + 2. Except as expressly provided in this Agreement, you may + not copy, sell, rent, sublicense, transfer, distribute, + modify, or create derivative works of any portion of the + SDK. For clarity, you may not distribute or sublicense the + SDK as a stand-alone product. + + 3. 
Unless you have an agreement with NVIDIA for this + purpose, you may not indicate that an application created + with the SDK is sponsored or endorsed by NVIDIA. + + 4. You may not bypass, disable, or circumvent any + encryption, security, digital rights management or + authentication mechanism in the SDK. + + 5. You may not use the SDK in any manner that would cause it + to become subject to an open source software license. As + examples, licenses that require as a condition of use, + modification, and/or distribution that the SDK be: + + a. Disclosed or distributed in source code form; + + b. Licensed for the purpose of making derivative works; + or + + c. Redistributable at no charge. + + 6. Unless you have an agreement with NVIDIA for this + purpose, you may not use the SDK with any system or + application where the use or failure of the system or + application can reasonably be expected to threaten or + result in personal injury, death, or catastrophic loss. + Examples include use in avionics, navigation, military, + medical, life support or other life critical applications. + NVIDIA does not design, test or manufacture the SDK for + these critical uses and NVIDIA shall not be liable to you + or any third party, in whole or in part, for any claims or + damages arising from such uses. + + 7. You agree to defend, indemnify and hold harmless NVIDIA + and its affiliates, and their respective employees, + contractors, agents, officers and directors, from and + against any and all claims, damages, obligations, losses, + liabilities, costs or debt, fines, restitutions and + expenses (including but not limited to attorney’s fees + and costs incident to establishing the right of + indemnification) arising out of or related to your use of + the SDK outside of the scope of this Agreement, or not in + compliance with its terms. + + +1.3. Ownership + + 1. 
NVIDIA or its licensors hold all rights, title and + interest in and to the SDK and its modifications and + derivative works, including their respective intellectual + property rights, subject to your rights described in this + section. This SDK may include software and materials from + NVIDIA’s licensors, and these licensors are intended + third party beneficiaries that may enforce this Agreement + with respect to their intellectual property rights. + + 2. You hold all rights, title and interest in and to your + applications and your derivative works of the sample + source code delivered in the SDK, including their + respective intellectual property rights, subject to + NVIDIA’s rights described in this section. + + 3. You may, but don’t have to, provide to NVIDIA + suggestions, feature requests or other feedback regarding + the SDK, including possible enhancements or modifications + to the SDK. For any feedback that you voluntarily provide, + you hereby grant NVIDIA and its affiliates a perpetual, + non-exclusive, worldwide, irrevocable license to use, + reproduce, modify, license, sublicense (through multiple + tiers of sublicensees), and distribute (through multiple + tiers of distributors) it without the payment of any + royalties or fees to you. NVIDIA will use feedback at its + choice. NVIDIA is constantly looking for ways to improve + its products, so you may send feedback to NVIDIA through + the developer portal at https://developer.nvidia.com. + + +1.4. No Warranties + +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL +FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND +ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND +OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, +BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE +ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. 
NO +WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF +DEALING OR COURSE OF TRADE. + + +1.5. Limitation of Liability + +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS +AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, +PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS +OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF +PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION +WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, +WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH +OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), +PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF +LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES +TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS +AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE +NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS +LIMIT. + +These exclusions and limitations of liability shall apply +regardless if NVIDIA or its affiliates have been advised of +the possibility of such damages, and regardless of whether a +remedy fails its essential purpose. These exclusions and +limitations of liability form an essential basis of the +bargain between the parties, and, absent any of these +exclusions or limitations of liability, the provisions of this +Agreement, including, without limitation, the economic terms, +would be substantially different. + + +1.6. Termination + + 1. This Agreement will continue to apply until terminated by + either you or NVIDIA as described below. + + 2. If you want to terminate this Agreement, you may do so by + stopping to use the SDK. + + 3. NVIDIA may, at any time, terminate this Agreement if: + + a. (i) you fail to comply with any term of this + Agreement and the non-compliance is not fixed within + thirty (30) days following notice from NVIDIA (or + immediately if you violate NVIDIA’s intellectual + property rights); + + b. 
(ii) you commence or participate in any legal + proceeding against NVIDIA with respect to the SDK; or + + c. (iii) NVIDIA decides to no longer provide the SDK in + a country or, in NVIDIA’s sole discretion, the + continued use of it is no longer commercially viable. + + 4. Upon any termination of this Agreement, you agree to + promptly discontinue use of the SDK and destroy all copies + in your possession or control. Your prior distributions in + accordance with this Agreement are not affected by the + termination of this Agreement. Upon written request, you + will certify in writing that you have complied with your + commitments under this section. Upon any termination of + this Agreement all provisions survive except for the + license grant provisions. + + +1.7. General + +If you wish to assign this Agreement or your rights and +obligations, including by merger, consolidation, dissolution +or operation of law, contact NVIDIA to ask for permission. Any +attempted assignment not approved by NVIDIA in writing shall +be void and of no effect. NVIDIA may assign, delegate or +transfer this Agreement and its rights and obligations, and if +to a non-affiliate you will be notified. + +You agree to cooperate with NVIDIA and provide reasonably +requested information to verify your compliance with this +Agreement. + +This Agreement will be governed in all respects by the laws of +the United States and of the State of Delaware as those laws +are applied to contracts entered into and performed entirely +within Delaware by Delaware residents, without regard to the +conflicts of laws principles. The United Nations Convention on +Contracts for the International Sale of Goods is specifically +disclaimed. You agree to all terms of this Agreement in the +English language. + +The state or federal courts residing in Santa Clara County, +California shall have exclusive jurisdiction over any dispute +or claim arising out of this Agreement. 
Notwithstanding this, +you agree that NVIDIA shall still be allowed to apply for +injunctive remedies or an equivalent type of urgent legal +relief in any jurisdiction. + +If any court of competent jurisdiction determines that any +provision of this Agreement is illegal, invalid or +unenforceable, such provision will be construed as limited to +the extent necessary to be consistent with and fully +enforceable under the law and the remaining provisions will +remain in full force and effect. Unless otherwise specified, +remedies are cumulative. + +Each party acknowledges and agrees that the other is an +independent contractor in the performance of this Agreement. + +The SDK has been developed entirely at private expense and is +“commercial items” consisting of “commercial computer +software” and “commercial computer software +documentation” provided with RESTRICTED RIGHTS. Use, +duplication or disclosure by the U.S. Government or a U.S. +Government subcontractor is subject to the restrictions in +this Agreement pursuant to DFARS 227.7202-3(a) or as set forth +in subparagraphs (c)(1) and (2) of the Commercial Computer +Software - Restricted Rights clause at FAR 52.227-19, as +applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas +Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and +regulations. You agree that you will not ship, transfer or +export the SDK into any country, or use the SDK in any manner, +prohibited by the United States Bureau of Industry and +Security or economic sanctions regulations administered by the +U.S. Department of Treasury’s Office of Foreign Assets +Control (OFAC), or any applicable export laws, restrictions or +regulations. These laws include restrictions on destinations, +end users and end use. By accepting this Agreement, you +confirm that you are not a resident or citizen of any country +currently embargoed by the U.S. and that you are not otherwise +prohibited from receiving the SDK. 
+ +Any notice delivered by NVIDIA to you under this Agreement +will be delivered via mail, email or fax. You agree that any +notices that NVIDIA sends you electronically will satisfy any +legal communication requirements. Please direct your legal +notices or other correspondence to NVIDIA Corporation, 2788 +San Tomas Expressway, Santa Clara, California 95051, United +States of America, Attention: Legal Department. + +This Agreement and any exhibits incorporated into this +Agreement constitute the entire agreement of the parties with +respect to the subject matter of this Agreement and supersede +all prior negotiations or documentation exchanged between the +parties relating to this SDK license. Any additional and/or +conflicting terms on documents issued by you are null, void, +and invalid. Any amendment or waiver under this Agreement +shall be in writing and signed by representatives of both +parties. + + +2. CUDA Toolkit Supplement to Software License Agreement for +NVIDIA Software Development Kits +------------------------------------------------------------ + + +Release date: August 16, 2018 +----------------------------- + +The terms in this supplement govern your use of the NVIDIA +CUDA Toolkit SDK under the terms of your license agreement +(“Agreement”) as modified by this supplement. Capitalized +terms used but not defined below have the meaning assigned to +them in the Agreement. + +This supplement is an exhibit to the Agreement and is +incorporated as an integral part of the Agreement. In the +event of conflict between the terms in this supplement and the +terms in the Agreement, the terms in this supplement govern. + + +2.1. License Scope + +The SDK is licensed for you to develop applications only for +use in systems with NVIDIA GPUs. + + +2.2. Distribution + +The portions of the SDK that are distributable under the +Agreement are listed in Attachment A. + + +2.3. 
Operating Systems + +Those portions of the SDK designed exclusively for use on the +Linux or FreeBSD operating systems, or other operating systems +derived from the source code to these operating systems, may +be copied and redistributed for use in accordance with this +Agreement, provided that the object code files are not +modified in any way (except for unzipping of compressed +files). + + +2.4. Audio and Video Encoders and Decoders + +You acknowledge and agree that it is your sole responsibility +to obtain any additional third-party licenses required to +make, have made, use, have used, sell, import, and offer for +sale your products or services that include or incorporate any +third-party software and content relating to audio and/or +video encoders and decoders from, including but not limited +to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A., +MPEG-LA, and Coding Technologies. NVIDIA does not grant to you +under this Agreement any necessary patent or other rights with +respect to any audio and/or video encoders and decoders. + + +2.5. Licensing + +If the distribution terms in this Agreement are not suitable +for your organization, or for any questions regarding this +Agreement, please contact NVIDIA at +nvidia-compute-license-questions@nvidia.com. + + +2.6. 
Attachment A + +The following portions of the SDK are distributable under the +Agreement: + +Component + +CUDA Runtime + +Windows + +cudart.dll, cudart_static.lib, cudadevrt.lib + +Mac OSX + +libcudart.dylib, libcudart_static.a, libcudadevrt.a + +Linux + +libcudart.so, libcudart_static.a, libcudadevrt.a + +Android + +libcudart.so, libcudart_static.a, libcudadevrt.a + +Component + +CUDA FFT Library + +Windows + +cufft.dll, cufftw.dll, cufft.lib, cufftw.lib + +Mac OSX + +libcufft.dylib, libcufft_static.a, libcufftw.dylib, +libcufftw_static.a + +Linux + +libcufft.so, libcufft_static.a, libcufftw.so, +libcufftw_static.a + +Android + +libcufft.so, libcufft_static.a, libcufftw.so, +libcufftw_static.a + +Component + +CUDA BLAS Library + +Windows + +cublas.dll, cublasLt.dll + +Mac OSX + +libcublas.dylib, libcublasLt.dylib, libcublas_static.a, +libcublasLt_static.a + +Linux + +libcublas.so, libcublasLt.so, libcublas_static.a, +libcublasLt_static.a + +Android + +libcublas.so, libcublasLt.so, libcublas_static.a, +libcublasLt_static.a + +Component + +NVIDIA "Drop-in" BLAS Library + +Windows + +nvblas.dll + +Mac OSX + +libnvblas.dylib + +Linux + +libnvblas.so + +Component + +CUDA Sparse Matrix Library + +Windows + +cusparse.dll, cusparse.lib + +Mac OSX + +libcusparse.dylib, libcusparse_static.a + +Linux + +libcusparse.so, libcusparse_static.a + +Android + +libcusparse.so, libcusparse_static.a + +Component + +CUDA Linear Solver Library + +Windows + +cusolver.dll, cusolver.lib + +Mac OSX + +libcusolver.dylib, libcusolver_static.a + +Linux + +libcusolver.so, libcusolver_static.a + +Android + +libcusolver.so, libcusolver_static.a + +Component + +CUDA Random Number Generation Library + +Windows + +curand.dll, curand.lib + +Mac OSX + +libcurand.dylib, libcurand_static.a + +Linux + +libcurand.so, libcurand_static.a + +Android + +libcurand.so, libcurand_static.a + +Component + +CUDA Accelerated Graph Library + +Component + +NVIDIA Performance Primitives Library + +Windows + +nppc.dll, 
nppc.lib, nppial.dll, nppial.lib, nppicc.dll, +nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll, +nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib, +nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll, +nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib + +Mac OSX + +libnppc.dylib, libnppc_static.a, libnppial.dylib, +libnppial_static.a, libnppicc.dylib, libnppicc_static.a, +libnppicom.dylib, libnppicom_static.a, libnppidei.dylib, +libnppidei_static.a, libnppif.dylib, libnppif_static.a, +libnppig.dylib, libnppig_static.a, libnppim.dylib, +libnppisu_static.a, libnppitc.dylib, libnppitc_static.a, +libnpps.dylib, libnpps_static.a + +Linux + +libnppc.so, libnppc_static.a, libnppial.so, +libnppial_static.a, libnppicc.so, libnppicc_static.a, +libnppicom.so, libnppicom_static.a, libnppidei.so, +libnppidei_static.a, libnppif.so, libnppif_static.a +libnppig.so, libnppig_static.a, libnppim.so, +libnppim_static.a, libnppist.so, libnppist_static.a, +libnppisu.so, libnppisu_static.a, libnppitc.so +libnppitc_static.a, libnpps.so, libnpps_static.a + +Android + +libnppc.so, libnppc_static.a, libnppial.so, +libnppial_static.a, libnppicc.so, libnppicc_static.a, +libnppicom.so, libnppicom_static.a, libnppidei.so, +libnppidei_static.a, libnppif.so, libnppif_static.a +libnppig.so, libnppig_static.a, libnppim.so, +libnppim_static.a, libnppist.so, libnppist_static.a, +libnppisu.so, libnppisu_static.a, libnppitc.so +libnppitc_static.a, libnpps.so, libnpps_static.a + +Component + +NVIDIA JPEG Library + +Linux + +libnvjpeg.so, libnvjpeg_static.a + +Component + +Internal common library required for statically linking to +cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP + +Mac OSX + +libculibos.a + +Linux + +libculibos.a + +Component + +NVIDIA Runtime Compilation Library and Header + +All + +nvrtc.h + +Windows + +nvrtc.dll, nvrtc-builtins.dll + +Mac OSX + +libnvrtc.dylib, libnvrtc-builtins.dylib + +Linux + +libnvrtc.so, libnvrtc-builtins.so + +Component + +NVIDIA Optimizing 
Compiler Library + +Windows + +nvvm.dll + +Mac OSX + +libnvvm.dylib + +Linux + +libnvvm.so + +Component + +NVIDIA Common Device Math Functions Library + +Windows + +libdevice.10.bc + +Mac OSX + +libdevice.10.bc + +Linux + +libdevice.10.bc + +Component + +CUDA Occupancy Calculation Header Library + +All + +cuda_occupancy.h + +Component + +CUDA Half Precision Headers + +All + +cuda_fp16.h, cuda_fp16.hpp + +Component + +CUDA Profiling Tools Interface (CUPTI) Library + +Windows + +cupti.dll + +Mac OSX + +libcupti.dylib + +Linux + +libcupti.so + +Component + +NVIDIA Tools Extension Library + +Windows + +nvToolsExt.dll, nvToolsExt.lib + +Mac OSX + +libnvToolsExt.dylib + +Linux + +libnvToolsExt.so + +Component + +NVIDIA CUDA Driver Libraries + +Linux + +libcuda.so, libnvidia-fatbinaryloader.so, +libnvidia-ptxjitcompiler.so + +The NVIDIA CUDA Driver Libraries are only distributable in +applications that meet this criteria: + + 1. The application was developed starting from a NVIDIA CUDA + container obtained from Docker Hub or the NVIDIA GPU + Cloud, and + + 2. The resulting application is packaged as a Docker + container and distributed to users on Docker Hub or the + NVIDIA GPU Cloud only. + + +2.7. Attachment B + + +Additional Licensing Obligations + +The following third party components included in the SOFTWARE +are licensed to Licensee pursuant to the following terms and +conditions: + + 1. Licensee's use of the GDB third party component is + subject to the terms and conditions of GNU GPL v3: + + This product includes copyrighted third-party software licensed + under the terms of the GNU General Public License v3 ("GPL v3"). + All third-party software packages are copyright by their respective + authors. 
GPL v3 terms and conditions are hereby incorporated into + the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt + + Consistent with these licensing requirements, the software + listed below is provided under the terms of the specified + open source software licenses. To obtain source code for + software provided under licenses that require + redistribution of source code, including the GNU General + Public License (GPL) and GNU Lesser General Public License + (LGPL), contact oss-requests@nvidia.com. This offer is + valid for a period of three (3) years from the date of the + distribution of this product by NVIDIA CORPORATION. + + Component License + CUDA-GDB GPL v3 + + 2. Licensee represents and warrants that any and all third + party licensing and/or royalty payment obligations in + connection with Licensee's use of the H.264 video codecs + are solely the responsibility of Licensee. + + 3. Licensee's use of the Thrust library is subject to the + terms and conditions of the Apache License Version 2.0. + All third-party software packages are copyright by their + respective authors. Apache License Version 2.0 terms and + conditions are hereby incorporated into the Agreement by + this reference. + http://www.apache.org/licenses/LICENSE-2.0.html + + In addition, Licensee acknowledges the following notice: + Thrust includes source code from the Boost Iterator, + Tuple, System, and Random Number libraries. + + Boost Software License - Version 1.0 - August 17th, 2003 + . . . . 
+ + Permission is hereby granted, free of charge, to any person or + organization obtaining a copy of the software and accompanying + documentation covered by this license (the "Software") to use, + reproduce, display, distribute, execute, and transmit the Software, + and to prepare derivative works of the Software, and to permit + third-parties to whom the Software is furnished to do so, all + subject to the following: + + The copyright notices in the Software and this entire statement, + including the above license grant, this restriction and the following + disclaimer, must be included in all copies of the Software, in whole + or in part, and all derivative works of the Software, unless such + copies or derivative works are solely in the form of machine-executable + object code generated by a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND + NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR + ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR + OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + 4. Licensee's use of the LLVM third party component is + subject to the following terms and conditions: + + ====================================================== + LLVM Release License + ====================================================== + University of Illinois/NCSA + Open Source License + + Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. + All rights reserved. 
+ + Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal with the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at Urbana- + Champaign, nor the names of its contributors may be used to endorse or + promote products derived from this Software without specific prior + written permission. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS WITH THE SOFTWARE. + + 5. Licensee's use (e.g. nvprof) of the PCRE third party + component is subject to the following terms and + conditions: + + ------------ + PCRE LICENCE + ------------ + PCRE is a library of functions to support regular expressions whose syntax + and semantics are as close as possible to those of the Perl 5 language. 
+ Release 8 of PCRE is distributed under the terms of the "BSD" licence, as + specified below. The documentation for PCRE, supplied in the "doc" + directory, is distributed under the same terms as the software itself. The + basic library functions are written in C and are freestanding. Also + included in the distribution is a set of C++ wrapper functions, and a just- + in-time compiler that can be used to optimize pattern matching. These are + both optional features that can be omitted when the library is built. + + THE BASIC LIBRARY FUNCTIONS + --------------------------- + Written by: Philip Hazel + Email local part: ph10 + Email domain: cam.ac.uk + University of Cambridge Computing Service, + Cambridge, England. + Copyright (c) 1997-2012 University of Cambridge + All rights reserved. + + PCRE JUST-IN-TIME COMPILATION SUPPORT + ------------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Emain domain: freemail.hu + Copyright(c) 2010-2012 Zoltan Herczeg + All rights reserved. + + STACK-LESS JUST-IN-TIME COMPILER + -------------------------------- + Written by: Zoltan Herczeg + Email local part: hzmester + Emain domain: freemail.hu + Copyright(c) 2009-2012 Zoltan Herczeg + All rights reserved. + + THE C++ WRAPPER FUNCTIONS + ------------------------- + Contributed by: Google Inc. + Copyright (c) 2007-2012, Google Inc. + All rights reserved. + + THE "BSD" LICENCE + ----------------- + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of the University of Cambridge nor the name of Google + Inc. nor the names of their contributors may be used to endorse or + promote products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 6. Some of the cuBLAS library routines were written by or + derived from code written by Vasily Volkov and are subject + to the Modified Berkeley Software Distribution License as + follows: + + Copyright (c) 2007-2009, Regents of the University of California + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ * Neither the name of the University of California, Berkeley nor + the names of its contributors may be used to endorse or promote + products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 7. Some of the cuBLAS library routines were written by or + derived from code written by Davide Barbieri and are + subject to the Modified Berkeley Software Distribution + License as follows: + + Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * The name of the author may not be used to endorse or promote + products derived from this software without specific prior + written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + 8. Some of the cuBLAS library routines were derived from + code developed by the University of Tennessee and are + subject to the Modified Berkeley Software Distribution + License as follows: + + Copyright (c) 2010 The University of Tennessee. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer listed in this license in the documentation and/or + other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 9. Some of the cuBLAS library routines were written by or + derived from code written by Jonathan Hogg and are subject + to the Modified Berkeley Software Distribution License as + follows: + + Copyright (c) 2012, The Science and Technology Facilities Council (STFC). + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the STFC nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE STFC BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 10. Some of the cuBLAS library routines were written by or + derived from code written by Ahmad M. Abdelfattah, David + Keyes, and Hatem Ltaief, and are subject to the Apache + License, Version 2.0, as follows: + + -- (C) Copyright 2013 King Abdullah University of Science and Technology + Authors: + Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa) + David Keyes (david.keyes@kaust.edu.sa) + Hatem Ltaief (hatem.ltaief@kaust.edu.sa) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the King Abdullah University of Science and + Technology nor the names of its contributors may be used to endorse + or promote products derived from this software without specific prior + written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE + + 11. Some of the cuSPARSE library routines were written by or + derived from code written by Li-Wen Chang and are subject + to the NCSA Open Source License as follows: + + Copyright (c) 2012, University of Illinois. + + All rights reserved. + + Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal with the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimers in the documentation and/or other materials provided + with the distribution. + * Neither the names of IMPACT Group, University of Illinois, nor + the names of its contributors may be used to endorse or promote + products derived from this Software without specific prior + written permission. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + 12. Some of the cuRAND library routines were written by or + derived from code written by Mutsuo Saito and Makoto + Matsumoto and are subject to the following license: + + Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima + University. All rights reserved. + + Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima + University and University of Tokyo. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the Hiroshima University nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 13. Some of the cuRAND library routines were derived from + code developed by D. E. Shaw Research and are subject to + the following license: + + Copyright 2010-2011, D. E. Shaw Research. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 14. Some of the Math library routines were written by or + derived from code developed by Norbert Juffa and are + subject to the following license: + + Copyright (c) 2015-2017, Norbert Juffa + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 15. Licensee's use of the lz4 third party component is + subject to the following terms and conditions: + + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + 16. The NPP library uses code from the Boost Math Toolkit, + and is subject to the following license: + + Boost Software License - Version 1.0 - August 17th, 2003 + . . . . + + Permission is hereby granted, free of charge, to any person or + organization obtaining a copy of the software and accompanying + documentation covered by this license (the "Software") to use, + reproduce, display, distribute, execute, and transmit the Software, + and to prepare derivative works of the Software, and to permit + third-parties to whom the Software is furnished to do so, all + subject to the following: + + The copyright notices in the Software and this entire statement, + including the above license grant, this restriction and the following + disclaimer, must be included in all copies of the Software, in whole + or in part, and all derivative works of the Software, unless such + copies or derivative works are solely in the form of machine-executable + object code generated by a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND + NON-INFRINGEMENT. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR + ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR + OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + 17. Portions of the Nsight Eclipse Edition is subject to the + following license: + + The Eclipse Foundation makes available all content in this plug-in + ("Content"). Unless otherwise indicated below, the Content is provided + to you under the terms and conditions of the Eclipse Public License + Version 1.0 ("EPL"). A copy of the EPL is available at http:// + www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program" + will mean the Content. + + If you did not receive this Content directly from the Eclipse + Foundation, the Content is being redistributed by another party + ("Redistributor") and different terms and conditions may apply to your + use of any object code in the Content. Check the Redistributor's + license that was provided with the Content. If no such license exists, + contact the Redistributor. Unless otherwise indicated below, the terms + and conditions of the EPL still apply to any source code in the + Content and such source code may be obtained at http://www.eclipse.org. + + 18. Some of the cuBLAS library routines uses code from + OpenAI, which is subject to the following license: + + License URL + https://github.com/openai/openai-gemm/blob/master/LICENSE + + License Text + The MIT License + + Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + 19. Licensee's use of the Visual Studio Setup Configuration + Samples is subject to the following license: + + The MIT License (MIT) + Copyright (C) Microsoft Corporation. All rights reserved. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + 20. Licensee's use of linmath.h header for CPU functions for + GL vector/matrix operations from lunarG is subject to the + Apache License Version 2.0. + + 21. The DX12-CUDA sample uses the d3dx12.h header, which is + subject to the MIT license . + +----------------- +``` + +### URLs + - `Homepage`: https://developer.nvidia.com/cuda-zone + + +## nvidia-nvtx (13.0.39) ### Licenses License: `Apache 2.0` - - `License.txt`: + - `licenses/License.txt`: ``` Apache License Version 2.0, January 2004 @@ -27719,7 +37241,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ### Licenses License: `Apache License v2.0` - - `licenses/LICENSE`: + - `LICENSE`: ``` Apache License @@ -27899,18 +37421,7 @@ License: `Apache License v2.0` END OF TERMS AND CONDITIONS - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] + Copyright 2021 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27930,7 +37441,7 @@ License: `Apache License v2.0` - `Repository`: https://github.com/onnx/onnx -## onnx-graphsurgeon (0.5.8) +## onnx_graphsurgeon (0.5.8) ### Licenses License: `Apache 2.0` @@ -28134,7 +37645,7 @@ License: `Apache 2.0` - `Homepage`: https://github.com/NVIDIA/TensorRT/tree/main/tools/onnx-graphsurgeon -## openai (2.3.0) +## openai (2.8.1) ### Licenses License: `Apache-2.0` @@ -31688,6 +41199,1733 @@ SOFTWARE. - `Homepage`: https://github.com/opencv/opencv-python +## opentelemetry-api (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/opentelemetry-api + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-exporter-otlp (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/exporter/opentelemetry-exporter-otlp + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-exporter-otlp-proto-common (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/exporter/opentelemetry-exporter-otlp-proto-common + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-exporter-otlp-proto-grpc (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/exporter/opentelemetry-exporter-otlp-proto-grpc + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-exporter-otlp-proto-http (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/exporter/opentelemetry-exporter-otlp-proto-http + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-proto (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/opentelemetry-proto + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-sdk (1.38.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/opentelemetry-sdk + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-semantic-conventions (0.59b0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/open-telemetry/opentelemetry-python/tree/main/opentelemetry-semantic-conventions + - `Repository`: https://github.com/open-telemetry/opentelemetry-python + + +## opentelemetry-semantic-conventions-ai (0.4.13) + +### Licenses +License: `Apache-2.0` + + + ## optimum (2.0.0) ### Licenses @@ -31902,6 +43140,99 @@ License: `Apache` - `Homepage`: https://github.com/huggingface/optimum +## optuna (3.6.1) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2018 Preferred Networks, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +== + +Optuna contains code that is licensed by third-party developers. + +== +SciPy + + +The Optuna contains the codes from SciPy project. + + +Copyright (c) 2001-2002 Enthought, Inc. 2003-2022, SciPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +== + +fdlibm + + Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + + Developed at SunPro, a Sun Microsystems, Inc. business. + Permission to use, copy, modify, and distribute this + software is freely granted, provided that this notice + is preserved. + +``` + +### URLs + - `bugtracker`: https://github.com/optuna/optuna/issues + - `documentation`: https://optuna.readthedocs.io + - `homepage`: https://optuna.org/ + - `repository`: https://github.com/optuna/optuna + + ## ordered-set (4.1.0) ### Licenses @@ -31934,6 +43265,283 @@ DEALINGS IN THE SOFTWARE. - `Home`: https://github.com/rspeer/ordered-set +## orjson (3.11.4) + +### Licenses +License: `Apache-2.0 OR MIT` + + - `licenses/LICENSE-MIT`: +``` +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +``` + + - `licenses/LICENSE-APACHE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+``` + +### URLs + - `changelog`: https://github.com/ijl/orjson/blob/master/CHANGELOG.md + - `documentation`: https://github.com/ijl/orjson + - `source`: https://github.com/ijl/orjson + + +## oyaml (1.0) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2018 wim glenn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/wimglenn/oyaml + + ## packaging (25.0) ### Licenses @@ -33423,11 +45031,291 @@ third-party archives. - `repository`: https://github.com/pandas-dev/pandas +## parameterized (0.9.0) + +### Licenses +License: `FreeBSD` + + - `LICENSE.txt`: +``` +Unless stated otherwise in the source files, all code is copyright 2010 David +Wolever . All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY DAVID WOLEVER ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL DAVID WOLEVER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of David Wolever. 
+``` + +### URLs + - `Homepage`: https://github.com/wolever/parameterized + + +## partial-json-parser (0.2.1.1.post7) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2025 Promplate + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `homepage`: https://promplate.dev/partial-json-parser + - `repository`: https://github.com/promplate/partial-json-parser + + ## patchelf (0.17.2.4) ### Licenses License: `GPL-3.0-or-later` + - `licenses/LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + - `licenses/COPYING`: ``` GNU GENERAL PUBLIC LICENSE @@ -34106,223 +45994,538 @@ Public License instead of this License. But first, please read . ``` - - `licenses/LICENSE`: -``` - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -``` - ### URLs - `Bug Tracker`: https://github.com/mayeut/patchelf-pypi/issues - `Homepage`: https://github.com/NixOS/patchelf - `Source Code`: https://github.com/mayeut/patchelf-pypi -## peft (0.17.1) +## pathspec (0.12.1) + +### Licenses +License: `Mozilla Public License 2.0 (MPL 2.0)` + + - `LICENSE`: +``` +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. 
"Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. 
+ +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. +``` + +### URLs + - `Documentation`: https://python-path-specification.readthedocs.io/en/latest/index.html + - `Issue Tracker`: https://github.com/cpburnz/python-pathspec/issues + - `Source Code`: https://github.com/cpburnz/python-pathspec + + +## pathvalidate (3.3.1) + +### Licenses +License: `MIT License` + + - `licenses/LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2016-2025 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Changelog`: https://github.com/thombashi/pathvalidate/blob/master/CHANGELOG.md + - `Documentation`: https://pathvalidate.rtfd.io/ + - `Homepage`: https://github.com/thombashi/pathvalidate + - `Source`: https://github.com/thombashi/pathvalidate + - `Tracker`: https://github.com/thombashi/pathvalidate/issues + + +## patsy (1.0.2) + +### Licenses +License: `2-clause BSD` + + - `licenses/LICENSE.txt`: +``` +The bulk of Patsy is distributed under a simple 2-clause BSD license: + + Copyright (C) 2011-2012, Patsy Developers. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +The module patsy.compat contains code derived from the Python +standard library, and is covered by the following license: + + PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 + -------------------------------------------- + + 1. This LICENSE AGREEMENT is between the Python Software Foundation + ("PSF"), and the Individual or Organization ("Licensee") accessing and + otherwise using this software ("Python") in source or binary form and + its associated documentation. + + 2. Subject to the terms and conditions of this License Agreement, PSF hereby + grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, + analyze, test, perform and/or display publicly, prepare derivative works, + distribute, and otherwise use Python alone or in any derivative version, + provided, however, that PSF's License Agreement and PSF's notice of copyright, + i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, + 2011, 2012 Python Software Foundation; All Rights Reserved" are retained in Python + alone or in any derivative version prepared by Licensee. + + 3. In the event Licensee prepares a derivative work that is based on + or incorporates Python or any part thereof, and wants to make + the derivative work available to others as provided herein, then + Licensee hereby agrees to include in any such work a brief summary of + the changes made to Python. + + 4. PSF is making Python available to Licensee on an "AS IS" + basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR + IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND + DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS + FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT + INFRINGE ANY THIRD PARTY RIGHTS. + + 5. 
PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON + FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS + A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, + OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + + 6. This License Agreement will automatically terminate upon a material + breach of its terms and conditions. + + 7. Nothing in this License Agreement shall be deemed to create any + relationship of agency, partnership, or joint venture between PSF and + Licensee. This License Agreement does not grant permission to use PSF + trademarks or trade name in a trademark sense to endorse or promote + products or services of Licensee, or any third party. + + 8. By copying, installing or otherwise using Python, Licensee + agrees to be bound by the terms and conditions of this License + Agreement. + +As per item (3), we are required to provide a brief summary of +changes. For this, see comments in patsy/compat.py. +``` + +### URLs + - `Homepage`: https://github.com/pydata/patsy + + +## peft (0.18.0) ### Licenses License: `Apache` - - `licenses/LICENSE`: + - `LICENSE`: ``` Apache License Version 2.0, January 2004 @@ -34531,12 +46734,47 @@ License: `Apache` - `Homepage`: https://github.com/huggingface/peft -## pillow (10.3.0) +## perf-analyzer (2.59.1) ### Licenses -License: `HPND` +License: `None` - - `LICENSE`: + - `licenses/LICENSE`: +``` +Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of NVIDIA CORPORATION nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.``` + + + +## pillow (12.0.0) + +### Licenses +License: `MIT-CMU` + + - `licenses/LICENSE`: ``` The Python Imaging Library (PIL) is @@ -34545,9 +46783,9 @@ The Python Imaging Library (PIL) is Pillow is the friendly PIL fork. It is - Copyright © 2010-2024 by Jeffrey A. Clark and contributors + Copyright © 2010 by Jeffrey A. Clark and contributors -Like PIL, Pillow is licensed under the open source HPND License: +Like PIL, Pillow is licensed under the open source MIT-CMU License: By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply @@ -34570,6 +46808,38 @@ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +---- + +AOM + +Copyright (c) 2016, Alliance for Open Media. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + ---- BROTLI @@ -34643,6 +46913,35 @@ bzip2/libbzip2 version 1.0.8 of 13 July 2019 -------------------------------------------------------------------------- +---- + +DAV1D + +Copyright © 2018-2019, VideoLAN and dav1d authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---- FREETYPE2 @@ -34860,351 +47159,6 @@ Legal Terms --- end of FTL.TXT --- --------------------------------------------------------------------------- - - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. 
- - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. 
You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. 
- -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. 
However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. 
-You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. 
- --------------------------------------------------------------------------- - The following license details are part of `src/bdf/README`: ``` @@ -35363,6 +47317,399 @@ The above copyright notice and this permission notice shall be included in all c THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---- + +LIBAVIF + +Copyright 2019 Joe Drago. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------ + +Files: src/obu.c + +Copyright © 2018-2019, VideoLAN and dav1d authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +------------------------------------------------------------------------------ + +Files: third_party/iccjpeg/* + +In plain English: + +1. We don't promise that this software works. (But if you find any bugs, + please let us know!) +2. You can use this software for whatever you want. You don't have to pay us. +3. You may not pretend that you wrote this software. If you use it in a + program, you must acknowledge somewhere in your documentation that + you've used the IJG code. + +In legalese: + +The authors make NO WARRANTY or representation, either express or implied, +with respect to this software, its quality, accuracy, merchantability, or +fitness for a particular purpose. This software is provided "AS IS", and you, +its user, assume the entire risk as to its quality and accuracy. + +This software is copyright (C) 1991-2013, Thomas G. Lane, Guido Vollbeding. +All Rights Reserved except as specified below. + +Permission is hereby granted to use, copy, modify, and distribute this +software (or portions thereof) for any purpose, without fee, subject to these +conditions: +(1) If any part of the source code for this software is distributed, then this +README file must be included, with this copyright and no-warranty notice +unaltered; and any additions, deletions, or changes to the original files +must be clearly indicated in accompanying documentation. +(2) If only executable code is distributed, then the accompanying +documentation must state that "this software is based in part on the work of +the Independent JPEG Group". +(3) Permission for use of this software is granted only if the user accepts +full responsibility for any undesirable consequences; the authors accept +NO LIABILITY for damages of any kind. + +These conditions apply to any software derived from or based on the IJG code, +not just to the unmodified library. If you use our work, you ought to +acknowledge us. 
+ +Permission is NOT granted for the use of any IJG author's name or company name +in advertising or publicity relating to this software or products derived from +it. This software may be referred to only as "the Independent JPEG Group's +software". + +We specifically permit and encourage the use of this software as the basis of +commercial products, provided that all warranty or liability claims are +assumed by the product vendor. + + +The Unix configuration script "configure" was produced with GNU Autoconf. +It is copyright by the Free Software Foundation but is freely distributable. +The same holds for its supporting scripts (config.guess, config.sub, +ltmain.sh). Another support script, install-sh, is copyright by X Consortium +but is also freely distributable. + +The IJG distribution formerly included code to read and write GIF files. +To avoid entanglement with the Unisys LZW patent, GIF reading support has +been removed altogether, and the GIF writer has been simplified to produce +"uncompressed GIFs". This technique does not use the LZW algorithm; the +resulting GIF files are larger than usual, but are readable by all standard +GIF decoders. + +We are required to state that + "The Graphics Interchange Format(c) is the Copyright property of + CompuServe Incorporated. GIF(sm) is a Service Mark property of + CompuServe Incorporated." + +------------------------------------------------------------------------------ + +Files: contrib/gdk-pixbuf/* + +Copyright 2020 Emmanuel Gil Peyrot. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------ + +Files: android_jni/gradlew* + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------------------------------------------------------------------ + +Files: third_party/libyuv/* + +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---- LIBJPEG @@ -35683,6 +48030,41 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +---- + +LIBYUV + +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---- OPENJPEG @@ -35881,19 +48263,129 @@ Gailly and Mark Adler; it does not include third-party code. If you redistribute modified sources, we would appreciate that you include in the file ChangeLog history information documenting your changes. Please read the FAQ for more information on the distribution of modified source versions. + + +---- + +ZSTD + +BSD License + +For Zstandard software + +Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook, nor Meta, nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` ### URLs - - `Changelog`: https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst + - `Changelog`: https://github.com/python-pillow/Pillow/releases - `Documentation`: https://pillow.readthedocs.io - `Funding`: https://tidelift.com/subscription/pkg/pypi-pillow?utm_source=pypi-pillow&utm_medium=pypi - - `Homepage`: https://python-pillow.org + - `Homepage`: https://python-pillow.github.io - `Mastodon`: https://fosstodon.org/@pillow - `Release notes`: https://pillow.readthedocs.io/en/stable/releasenotes/index.html - `Source`: https://github.com/python-pillow/Pillow -## plotly (6.3.1) +## pip (24.0) + +### Licenses +License: `MIT` + + - `LICENSE.txt`: +``` +Copyright (c) 2008-present The pip developers (see AUTHORS.txt file) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `Changelog`: https://pip.pypa.io/en/stable/news/ + - `Documentation`: https://pip.pypa.io + - `Homepage`: https://pip.pypa.io/ + - `Source`: https://github.com/pypa/pip + + +## platformdirs (4.5.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2010-202x The platformdirs developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Changelog`: https://github.com/tox-dev/platformdirs/releases + - `Documentation`: https://platformdirs.readthedocs.io + - `Homepage`: https://github.com/tox-dev/platformdirs + - `Source`: https://github.com/tox-dev/platformdirs + - `Tracker`: https://github.com/tox-dev/platformdirs/issues + + +## plotly (6.5.0) ### Licenses License: `MIT License` @@ -35930,6 +48422,38 @@ THE SOFTWARE. - `HomePage`: https://plotly.com/python/ +## pluggy (1.6.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2015 holger krekel (rather uses bitbucket/hpk42) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + + + ## polygraphy (0.49.26) ### Licenses @@ -36134,7 +48658,65 @@ License: `Apache 2.0` - `Homepage`: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy -## prometheus-client (0.23.1) +## portalocker (3.2.0) + +### Licenses +License: `BSD-3-Clause` + + - `licenses/LICENSE`: +``` +Copyright 2022 Rick van Hattem + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+``` + +### URLs + - `bugs`: https://github.com/wolph/portalocker/issues + - `documentation`: https://portalocker.readthedocs.io/en/latest/ + - `repository`: https://github.com/wolph/portalocker/ + + +## pre_commit (4.5.0) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` +Copyright (c) 2014 pre-commit dev team: Anthony Sottile, Ken Struys + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/pre-commit/pre-commit + + +## prometheus_client (0.23.1) ### Licenses License: `Apache-2.0 AND BSD-2-Clause` @@ -36601,7 +49183,7 @@ License: `Apache-2.0` - `Homepage`: https://github.com/aio-libs/propcache -## protobuf (6.33.0) +## protobuf (6.33.1) ### Licenses License: `3-Clause BSD License` @@ -36646,7 +49228,7 @@ support library is itself covered by the above license. 
- `Homepage`: https://developers.google.com/protocol-buffers/ -## psutil (7.1.0) +## psutil (7.1.3) ### Licenses License: `BSD-3-Clause` @@ -36688,7 +49270,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Homepage`: https://github.com/giampaolo/psutil -## pulp (3.3.0) +## PuLP (3.3.0) ### Licenses License: `MIT` @@ -36724,12 +49306,66 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - `source`: https://github.com/coin-or/pulp -## pyarrow (21.0.0) +## py (1.11.0) + +### Licenses +License: `MIT license` + + - `_vendored_packages/apipkg-2.0.0.dist-info/LICENSE`: +``` + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+``` + + - `_vendored_packages/iniconfig-1.1.1.dist-info/LICENSE`: +``` + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +``` + +### URLs + - `Homepage`: https://py.readthedocs.io/ + + +## pyarrow (22.0.0) ### Licenses License: `Apache Software License` - - `LICENSE.txt`: + - `licenses/LICENSE.txt`: ``` Apache License @@ -39065,6 +51701,111 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - `Repository`: https://github.com/apache/arrow +## pybind11 (3.0.1) + +### Licenses +License: `BSD-3-Clause` + + - `licenses/LICENSE`: +``` +Copyright (c) 2019 Sergei Izmailov , All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +You are under no obligation whatsoever to provide any bug fixes, patches, or +upgrades to the features, functionality or performance of the source code +("Enhancements") to anyone; however, if you choose to make your Enhancements +available either publicly, or directly to the author of this software, without +imposing a separate written license agreement for such Enhancements, then you +hereby grant the following license: a non-exclusive, royalty-free perpetual +license to install, use, modify, prepare derivative works, incorporate into +other computer software, distribute, and sublicense such enhancements or +derivative works thereof, in binary and source code form. 
+``` + +### URLs + - `Bug Tracker`: https://github.com/pybind/pybind11/issues + - `Changelog`: https://pybind11.readthedocs.io/en/latest/changelog.html + - `Chat`: https://gitter.im/pybind/Lobby + - `Discussions`: https://github.com/pybind/pybind11/discussions + - `Documentation`: https://pybind11.readthedocs.io/ + - `Homepage`: https://github.com/pybind/pybind11 + + +## pybind11-stubgen (2.5.5) + +### Licenses +License: `BSD` + + - `licenses/LICENSE`: +``` +Copyright (c) 2019 Sergei Izmailov , All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +You are under no obligation whatsoever to provide any bug fixes, patches, or +upgrades to the features, functionality or performance of the source code +("Enhancements") to anyone; however, if you choose to make your Enhancements +available either publicly, or directly to the author of this software, without +imposing a separate written license agreement for such Enhancements, then you +hereby grant the following license: a non-exclusive, royalty-free perpetual +license to install, use, modify, prepare derivative works, incorporate into +other computer software, distribute, and sublicense such enhancements or +derivative works thereof, in binary and source code form. +``` + +### URLs + - `Homepage`: https://github.com/sizmailov/pybind11-stubgen + + ## pycparser (2.23) ### Licenses @@ -39080,24 +51821,24 @@ All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright notice, this +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -* Neither the name of the copyright holder nor the names of its contributors may - be used to endorse or promote products derived from this software without +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` @@ -39181,7 +51922,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Source`: https://github.com/Legrandin/pycryptodome/ -## pydantic (2.12.2) +## pydantic (2.11.10) ### Licenses License: `MIT` @@ -39190,7 +51931,7 @@ License: `MIT` ``` The MIT License (MIT) -Copyright (c) 2017 to present Pydantic Services Inc. and individual contributors. 
+Copyright (c) 2022 Samuel Colvin and other contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -39219,7 +51960,7 @@ SOFTWARE. - `Source`: https://github.com/pydantic/pydantic -## pydantic-core (2.41.4) +## pydantic_core (2.33.2) ### Licenses License: `MIT` @@ -39255,7 +51996,7 @@ SOFTWARE. - `Source`: https://github.com/pydantic/pydantic-core -## pydantic-settings (2.11.0) +## pydantic-settings (2.12.0) ### Licenses License: `MIT` @@ -39293,7 +52034,7 @@ SOFTWARE. - `Source`: https://github.com/pydantic/pydantic-settings -## pygments (2.19.2) +## Pygments (2.19.2) ### Licenses License: `BSD-2-Clause` @@ -39407,7 +52148,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - `Homepage`: https://github.com/pyparsing/pyparsing/ -## pyproject-hooks (1.2.0) +## pyproject_hooks (1.2.0) ### Licenses License: `MIT License` @@ -39443,6 +52184,1459 @@ THE SOFTWARE. - `Source`: https://github.com/pypa/pyproject-hooks +## pytablewriter (1.2.1) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +The MIT License (MIT) + +Copyright (c) 2016-2025 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://github.com/thombashi/pytablewriter/blob/master/CHANGELOG.md + - `Documentation`: https://pytablewriter.rtfd.io/ + - `Funding`: https://github.com/sponsors/thombashi + - `Homepage`: https://github.com/thombashi/pytablewriter + - `Source`: https://github.com/thombashi/pytablewriter + - `Tracker`: https://github.com/thombashi/pytablewriter/issues + + +## pytest (8.4.2) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Changelog`: https://docs.pytest.org/en/stable/changelog.html + - `Contact`: https://docs.pytest.org/en/stable/contact.html + - `Funding`: https://docs.pytest.org/en/stable/sponsor.html + - `Homepage`: https://docs.pytest.org/en/latest/ + - `Source`: https://github.com/pytest-dev/pytest + - `Tracker`: https://github.com/pytest-dev/pytest/issues + + +## pytest-asyncio (1.3.0) + +### Licenses +License: `Apache-2.0` + + - `licenses/LICENSE`: +``` +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Bug Tracker`: https://github.com/pytest-dev/pytest-asyncio/issues + - `Changelog`: https://pytest-asyncio.readthedocs.io/en/latest/reference/changelog.html + - `Documentation`: https://pytest-asyncio.readthedocs.io + - `Homepage`: https://github.com/pytest-dev/pytest-asyncio + - `Source Code`: https://github.com/pytest-dev/pytest-asyncio + + +## pytest-cov (7.0.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +The MIT License + +Copyright (c) 2010 Meme Dough + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` + +### URLs + - `Changelog`: https://pytest-cov.readthedocs.io/en/latest/changelog.html + - `Documentation`: https://pytest-cov.readthedocs.io/ + - `Issue Tracker`: https://github.com/pytest-dev/pytest-cov/issues + - `Sources`: https://github.com/pytest-dev/pytest-cov + + +## pytest-csv (3.0.0) + +### Licenses +License: `GPLv3` + + - `COPYING`: +``` + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things.
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. 
+States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. 
However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. 
+ + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
+ +``` + +### URLs + - `Homepage`: https://github.com/nicoulaj/pytest-csv + + +## pytest-env (1.2.0) + +### Licenses +License: `MIT License` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2010-202x The pytest-env developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/pytest-dev/pytest-env + - `Source`: https://github.com/pytest-dev/pytest-env + - `Tracker`: https://github.com/pytest-dev/pytest-env/issues + + +## pytest-forked (1.6.0) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/pytest-dev/pytest-forked + + +## pytest-mock (3.15.1) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) [2016] [Bruno Oliveira] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://pytest-mock.readthedocs.io/en/latest/changelog.html + - `Documentation`: https://pytest-mock.readthedocs.io/en/latest/ + - `Homepage`: https://github.com/pytest-dev/pytest-mock/ + - `Source`: https://github.com/pytest-dev/pytest-mock/ + - `Tracker`: https://github.com/pytest-dev/pytest-mock/issues + + +## pytest-rerunfailures (16.1) + +### Licenses +License: `MPL-2.0` + + - `licenses/LICENSE`: +``` +This Source Code Form is subject to the terms of the Mozilla Public +License, v. 2.0. If a copy of the MPL was not distributed with this +file, You can obtain one at https://www.mozilla.org/MPL/2.0/. 
+``` + +### URLs + - `Homepage`: https://github.com/pytest-dev/pytest-rerunfailures + + +## pytest-split (0.10.0) + +### Licenses +License: `MIT` + + - `LICENSE`: +``` +Copyright (c) 2024 Jerry Pussinen + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+``` + +### URLs + - `Documentation`: https://jerry-git.github.io/pytest-split + - `Homepage`: https://jerry-git.github.io/pytest-split + - `Repository`: https://github.com/jerry-git/pytest-split + + +## pytest-threadleak (0.5.0) + +### Licenses +License: `MIT` + + - `LICENSES/MIT.txt`: +``` +MIT License + +Copyright (c) 2017 Nir Soffer + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+``` + +### URLs + - `Homepage`: https://github.com/nirs/pytest-threadleak + + +## pytest-timeout (2.4.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +The MIT License + +Copyright (C) 2012, 2014 Floris Bruynooghe + + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/pytest-dev/pytest-timeout + + +## pytest-xdist (3.8.0) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2010 Holger Krekel and contributors. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://pytest-xdist.readthedocs.io/en/latest/changelog.html + - `Documentation`: https://pytest-xdist.readthedocs.io/en/latest + - `Homepage`: https://github.com/pytest-dev/pytest-xdist + - `Source`: https://github.com/pytest-dev/pytest-xdist + - `Tracker`: https://github.com/pytest-dev/pytest-xdist/issues + + ## python-dateutil (2.9.0.post0) ### Licenses @@ -39511,7 +53705,7 @@ The above BSD License Applies to all code, even that also covered by Apache 2.0. - `Source`: https://github.com/dateutil/dateutil -## python-dotenv (1.1.1) +## python-dotenv (1.2.1) ### Licenses License: `BSD-3-Clause` @@ -39548,7 +53742,44 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
``` ### URLs - - `Homepage`: https://github.com/theskumar/python-dotenv + - `Source`: https://github.com/theskumar/python-dotenv + + +## python-rapidjson (1.22) + +### Licenses +License: `MIT License` + + - `licenses/LICENSE`: +``` +python-rapidjson is licensed under the MIT license. + +The MIT License (MIT) + +Copyright (c) 2015, 2016, 2017 Ken Robbins +Copyright (c) 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024 Lele Gaifax + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/python-rapidjson/python-rapidjson ## pytz (2025.2) @@ -39584,7 +53815,7 @@ DEALINGS IN THE SOFTWARE. - `Homepage`: http://pythonhosted.org/pytz -## pyyaml (6.0.3) +## PyYAML (6.0.3) ### Licenses License: `MIT` @@ -39662,28 +53893,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
``` - - `licenses/licenses/LICENSE.libsodium.txt`: -``` -/* - * ISC License - * - * Copyright (c) 2013-2024 - * Frank Denis - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -``` - - `licenses/licenses/LICENSE.zeromq.txt`: ``` Mozilla Public License Version 2.0 @@ -40267,6 +54476,28 @@ Exhibit B - "Incompatible With Secondary Licenses" Notice limitations under the License. ``` + - `licenses/licenses/LICENSE.libsodium.txt`: +``` +/* + * ISC License + * + * Copyright (c) 2013-2024 + * Frank Denis + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +``` + ### URLs - `Documentation`: https://pyzmq.readthedocs.org - `Homepage`: https://pyzmq.readthedocs.org @@ -40312,7 +54543,7 @@ THE SOFTWARE. - `Tidelift`: https://tidelift.com/subscription/pkg/pypi-referencing?utm_source=pypi-referencing&utm_medium=referral&utm_campaign=pypi-link -## regex (2025.9.18) +## regex (2025.11.3) ### Licenses License: `Apache-2.0 AND CNRI-Python` @@ -40723,6 +54954,224 @@ License: `Apache-2.0` - `Source`: https://github.com/psf/requests +## responses (0.25.8) + +### Licenses +License: `Apache 2.0` + + - `LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright 2015 David Cramer + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +``` + +### URLs + - `Bug Tracker`: https://github.com/getsentry/responses/issues + - `Changes`: https://github.com/getsentry/responses/blob/master/CHANGES + - `Documentation`: https://github.com/getsentry/responses/blob/master/README.rst + - `Homepage`: https://github.com/getsentry/responses + - `Source Code`: https://github.com/getsentry/responses + + ## rich (14.2.0) ### Licenses @@ -40756,7 +55205,437 @@ SOFTWARE. - `Homepage`: https://github.com/Textualize/rich -## rpds-py (0.27.1) +## rouge (1.0.1) + +### Licenses +License: `LICENCE.txt` + + - `LICENSE`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Download`: https://github.com/pltrdy/rouge/archive/1.0.1.tar.gz + - `Homepage`: http://github.com/pltrdy/rouge + + +## rouge_score (0.1.2) + +### Licenses +License: `Apache Software License` + + - `LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://github.com/google-research/google-research/tree/master/rouge + + +## rpds-py (0.29.0) ### Licenses License: `MIT` @@ -40794,7 +55673,1635 @@ THE SOFTWARE. - `Upstream`: https://github.com/orium/rpds -## safetensors (0.6.2) +## ruff (0.9.4) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2022 Charles Marsh + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +end of terms and conditions + +The externally maintained libraries from which parts of the Software is derived +are: + +- flake8-comprehensions, licensed as follows: + """ + MIT License + + Copyright (c) 2017 Adam Johnson + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- flake8-no-pep420, licensed as follows: + """ + MIT License + + Copyright (c) 2020 Adam Johnson + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-tidy-imports, licensed as follows: + """ + MIT License + + Copyright (c) 2017 Adam Johnson + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-return, licensed as follows: + """ + MIT License + + Copyright (c) 2019 Afonasev Evgeniy + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- flake8-2020, licensed as follows: + """ + Copyright (c) 2019 Anthony Sottile + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- pyupgrade, licensed as follows: + """ + Copyright (c) 2017 Anthony Sottile + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- flake8-blind-except, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2014 Elijah Andrews + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + """ + +- flake8-gettext, licensed as follows: + """ + BSD Zero Clause License + + Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. 
+ + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + """ + +- flake8-implicit-str-concat, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2019 Dylan Turner + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ """ + +- flake8-debugger, licensed as follows: + """ + MIT License + + Copyright (c) 2016 Joseph Kahn + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-pyi, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2016 Łukasz Langa + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-print, licensed as follows: + """ + MIT License + + Copyright (c) 2016 Joseph Kahn + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- flake8-import-conventions, licensed as follows: + """ + MIT License + + Copyright (c) 2021 João Palmeiro + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-simplify, licensed as follows: + """ + MIT License + + Copyright (c) 2020 Martin Thoma + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-slots, licensed as follows: + """ + Copyright (c) 2021 Dominic Davis-Foster + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE + OR OTHER DEALINGS IN THE SOFTWARE. + """ + +- flake8-todos, licensed as follows: + """ + Copyright (c) 2019 EclecticIQ. All rights reserved. + Copyright (c) 2020 Gram . All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ """ + +- flake8-unused-arguments, licensed as follows: + """ + MIT License + + Copyright (c) 2019 Nathan Hoad + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- pygrep-hooks, licensed as follows: + """ + Copyright (c) 2018 Anthony Sottile + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- autoflake, licensed as follows: + """ + Copyright (C) 2012-2018 Steven Myint + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- autotyping, licensed as follows: + """ + MIT License + + Copyright (c) 2023 Jelle Zijlstra + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- Flake8, licensed as follows: + """ + == Flake8 License (MIT) == + + Copyright (C) 2011-2013 Tarek Ziade + Copyright (C) 2012-2016 Ian Cordasco + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-bugbear, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2016 Łukasz Langa + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-commas, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2017 Thomas Grainger. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + + Portions of this flake8-commas Software may utilize the following + copyrighted material, the use of which is hereby acknowledged. + + Original flake8-commas: https://github.com/trevorcreech/flake8-commas/commit/e8563b71b1d5442e102c8734c11cb5202284293d + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- flynt, licensed as follows: + """ + MIT License + + Copyright (c) 2019-2022 Ilya Kamenshchikov + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- isort, licensed as follows: + """ + The MIT License (MIT) + + Copyright (c) 2013 Timothy Edmund Crosley + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- pep8-naming, licensed as follows: + """ + Copyright © 2013 Florent Xicluna + + Licensed under the terms of the Expat License + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- pycodestyle, licensed as follows: + """ + Copyright © 2006-2009 Johann C. Rocholl + Copyright © 2009-2014 Florent Xicluna + Copyright © 2014-2020 Ian Lee + + Licensed under the terms of the Expat License + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- pydocstyle, licensed as follows: + """ + Copyright (c) 2012 GreenSteam, + + Copyright (c) 2014-2020 Amir Rachum, + + Copyright (c) 2020 Sambhav Kothari, + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- Pyflakes, licensed as follows: + """ + Copyright 2005-2011 Divmod, Inc. + Copyright 2013-2014 Florent Xicluna + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + """ + +- flake8-use-pathlib, licensed as follows: + """ + MIT License + + Copyright (c) 2021 Rodolphe Pelloux-Prayer + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- RustPython, licensed as follows: + """ + MIT License + + Copyright (c) 2020 RustPython Team + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-annotations, licensed as follows: + """ + MIT License + + Copyright (c) 2019 - Present S. Co1 + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-async, licensed as follows: + """ + MIT License + + Copyright (c) 2022 Cooper Lees + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-type-checking, licensed as follows: + """ + Copyright (c) 2021, Sondre Lillebø Gundersen + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of pytest-{{ cookiecutter.plugin_name }} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ """ + +- flake8-bandit, licensed as follows: + """ + Copyright (c) 2017 Tyler Wince + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- flake8-eradicate, licensed as follows: + """ + MIT License + + Copyright (c) 2018 Nikita Sobolev + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-quotes, licensed as follows: + """ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + """ + +- flake8-logging-format, licensed as follows: + """ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + +- flake8-raise, licensed as follows: + """ + MIT License + + Copyright (c) 2020 Jon Dufresne + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-self, licensed as follows: + """ + MIT License + + Copyright (c) 2023 Korijn van Golen + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-django, licensed under the GPL license. 
+ +- perflint, licensed as follows: + """ + MIT License + + Copyright (c) 2022 Anthony Shaw + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-logging, licensed as follows: + """ + MIT License + + Copyright (c) 2023 Adam Johnson + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- flake8-trio, licensed as follows: + """ + MIT License + + Copyright (c) 2022 Zac Hatfield-Dodds + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + +- Pyright, licensed as follows: + """ + MIT License + + Pyright - A static type checker for the Python language + Copyright (c) Microsoft Corporation. All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE + """ + +- rust-analyzer/text-size, licensed under the MIT license: + """ + Permission is hereby granted, free of charge, to any + person obtaining a copy of this software and associated + documentation files (the "Software"), to deal in the + Software without restriction, including without + limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software + is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice + shall be included in all copies or substantial portions + of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT + SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + """ + +- rome/tools, licensed under the MIT license: + """ + MIT License + + Copyright (c) Rome Tools, Inc. and its affiliates. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + +- pydoclint, licensed as follows: + """ + MIT License + + Copyright (c) 2023 jsh9 + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ +``` + +### URLs + - `Changelog`: https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md + - `Documentation`: https://docs.astral.sh/ruff/ + - `Homepage`: https://docs.astral.sh/ruff + - `Repository`: https://github.com/astral-sh/ruff + + +## sacrebleu (2.5.1) + +### Licenses +License: `Apache Software License` + + - `LICENSE.txt`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Repository`: https://github.com/mjpost/sacrebleu + + +## safetensors (0.7.0) ### Licenses License: `Apache Software License` @@ -41009,174 +57516,52 @@ License: `Apache Software License` - `Source`: https://github.com/huggingface/safetensors -## scipy (1.16.2) +## scikit-learn (1.7.2) ### Licenses -License: `BSD License` +License: `BSD-3-Clause` - - `LICENSE.txt`: + - `licenses/COPYING`: ``` -Copyright (c) 2001-2002 Enthought, Inc. 2003, SciPy Developers. +BSD 3-Clause License + +Copyright (c) 2007-2024 The scikit-learn developers. All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: +modification, are permitted provided that the following conditions are met: -1. 
Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---- -This binary distribution of SciPy can also bundle the following software -(depending on the build): - - -Name: OpenBLAS -Files: scipy.libs/libscipy_openblas*.so -Description: bundled as a dynamically linked library -Availability: https://github.com/OpenMathLib/OpenBLAS/ -License: BSD-3-Clause - Copyright (c) 2011-2014, The OpenBLAS Project - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. 
Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE - USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Name: LAPACK -Files: scipy.libs/libscipy_openblas*.so -Description: bundled in OpenBLAS -Availability: https://github.com/OpenMathLib/OpenBLAS/ -License: BSD-3-Clause-Open-MPI - Copyright (c) 1992-2013 The University of Tennessee and The University - of Tennessee Research Foundation. All rights - reserved. - Copyright (c) 2000-2013 The University of California Berkeley. All - rights reserved. - Copyright (c) 2006-2013 The University of Colorado Denver. All rights - reserved. 
- - $COPYRIGHT$ - - Additional copyrights may follow - - $HEADER$ - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer listed - in this license in the documentation and/or other materials - provided with the distribution. - - - Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - The copyright holders provide no reassurances that the source code - provided does not infringe any patent, copyright, or any other - intellectual property rights of third parties. The copyright holders - disclaim any liability to any recipient for claims brought against - recipient by any third party for infringement of that parties - intellectual property rights. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+This binary distribution of scikit-learn also bundles the following software: +---- Name: GCC runtime library -Files: scipy.libs/libgfortran*.so -Description: dynamically linked to files compiled with gcc -Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgfortran -License: GPL-3.0-or-later WITH GCC-exception-3.1 - Copyright (C) 2002-2017 Free Software Foundation, Inc. - - Libgfortran is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - Libgfortran is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - Under Section 7 of GPL version 3, you are granted additional - permissions described in the GCC Runtime Library Exception, version - 3.1, as published by the Free Software Foundation. - - You should have received a copy of the GNU General Public License and - a copy of the GCC Runtime Library Exception along with this program; - see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - . - ----- - -Full text of license texts referred to above follows (that they are -listed below does not necessarily imply the conditions apply to the -present binary release): - ----- +Files: scikit_learn.libs/libgomp*.so* +Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgomp GCC RUNTIME LIBRARY EXCEPTION @@ -41250,706 +57635,173 @@ consistent with the licensing of the Independent Modules. The availability of this Exception does not imply any general presumption that third-party software is unaffected by the copyleft requirements of the license of GCC. +``` ----- - - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. 
- Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. 
- - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. 
- - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. 
- - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. 
The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>.
- - -Name: libquadmath -Files: scipy.libs/libquadmath*.so -Description: dynamically linked to files compiled with gcc -Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libquadmath -License: LGPL-2.1-or-later - - GCC Quad-Precision Math Library - Copyright (C) 2010-2019 Free Software Foundation, Inc. - Written by Francois-Xavier Coudert - - This file is part of the libquadmath library. - Libquadmath is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - Libquadmath is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html +### URLs + - `download`: https://pypi.org/project/scikit-learn/#files + - `homepage`: https://scikit-learn.org + - `release notes`: https://scikit-learn.org/stable/whats_new + - `source`: https://github.com/scikit-learn/scikit-learn + - `tracker`: https://github.com/scikit-learn/scikit-learn/issues + + +## scipy (1.16.3) + +### Licenses +License: `BSD License` + + - `spatial/qhull_src/COPYING_QHULL.txt`: +``` + Qhull, Copyright (c) 1993-2020 + + C.B. Barber + Arlington, MA + + and + + The National Science and Technology Research Center for + Computation and Visualization of Geometric Structures + (The Geometry Center) + University of Minnesota + + email: qhull@qhull.org + +This software includes Qhull from C.B. Barber and The Geometry Center. +Files derived from Qhull 1.0 are copyrighted by the Geometry Center. The +remaining files are copyrighted by C.B. Barber. Qhull is free software +and may be obtained via http from www.qhull.org. 
It may be freely copied, +modified, and redistributed under the following conditions: + +1. All copyright notices must remain intact in all files. + +2. A copy of this text file must be distributed along with any copies + of Qhull that you redistribute; this includes copies that you have + modified, or copies of programs or other software products that + include Qhull. + +3. If you modify Qhull, you must include a notice giving the + name of the person performing the modification, the date of + modification, and the reason for such modification. + +4. When distributing modified versions of Qhull, or other software + products that include Qhull, you must provide notice that the original + source code may be obtained as noted above. + +5. There is no warranty or other guarantee of fitness for Qhull, it is + provided solely "as is". Bug reports or fixes may be sent to + qhull_bug@qhull.org; the authors may or may not act on them as + they desire. +``` + + - `sparse/linalg/_eigen/arpack/COPYING`: +``` + +BSD Software License + +Pertains to ARPACK and P_ARPACK + +Copyright (c) 1996-2008 Rice University. +Developed by D.C. Sorensen, R.B. Lehoucq, C. Yang, and K. Maschhoff. +All rights reserved. + +Arpack has been renamed to arpack-ng. + +Copyright (c) 2001-2011 - Scilab Enterprises +Updated by Allan Cornet, Sylvestre Ledru. + +Copyright (c) 2010 - Jordi Gutiérrez Hermoso (Octave patch) + +Copyright (c) 2007 - Sébastien Fabbro (gentoo patch) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer listed + in this license in the documentation and/or other materials + provided with the distribution. 
+ +- Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `fft/_pocketfft/LICENSE.md`: +``` +Copyright (C) 2010-2019 Max-Planck-Society +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `_lib/_uarray/LICENSE`: +``` +BSD 3-Clause License + +Copyright (c) 2018, Quansight-Labs +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` ### URLs @@ -42180,7 +58032,7 @@ License: `None` ### Licenses License: `None` - - `licenses/LICENSE`: + - `_vendor/jaraco.context-5.3.0.dist-info/LICENSE`: ``` Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -42201,12 +58053,1440 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` + - `_vendor/packaging-24.2.dist-info/LICENSE`: +``` +This software is made available under the terms of *either* of the licenses +found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made +under the terms of *both* these licenses. +``` + + - `_vendor/packaging-24.2.dist-info/LICENSE.BSD`: +``` +Copyright (c) Donald Stufft and individual contributors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `_vendor/packaging-24.2.dist-info/LICENSE.APACHE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS +``` + + - `_vendor/zipp-3.19.2.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +``` + + - `_vendor/wheel-0.45.1.dist-info/LICENSE.txt`: +``` +MIT License + +Copyright (c) 2012 Daniel Holth and contributors + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. +``` + + - `_vendor/more_itertools-10.3.0.dist-info/LICENSE`: +``` +Copyright (c) 2012 Erik Rose + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + + - `_vendor/jaraco.functools-4.0.1.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +``` + + - `_vendor/tomli-2.0.1.dist-info/LICENSE`: +``` +MIT License + +Copyright (c) 2021 Taneli Hukkinen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + + - `_vendor/backports.tarfile-1.2.0.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. 
+``` + + - `_vendor/jaraco.text-3.12.1.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +``` + + - `_vendor/importlib_metadata-8.0.0.dist-info/LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + + - `_vendor/typeguard-4.3.0.dist-info/LICENSE`: +``` +This is the MIT license: http://www.opensource.org/licenses/mit-license.php + +Copyright (c) Alex Grönholm + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE +FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+``` + + - `_vendor/inflect-7.3.1.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +``` + + - `_vendor/autocommand-2.2.2.dist-info/LICENSE`: +``` +GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. 
+ + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. 
+ + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. 
+ + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. + +``` + + - `_vendor/typing_extensions-4.12.2.dist-info/LICENSE`: +``` +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. 
+ +All Python releases are Open Source (see https://opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. 
This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. 
Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. 
BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. 
Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. 
CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. +Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. 
+ +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. 
+``` + + - `_vendor/jaraco.collections-5.1.0.dist-info/LICENSE`: +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +``` + + - `_vendor/platformdirs-4.2.2.dist-info/licenses/LICENSE`: +``` +MIT License + +Copyright (c) 2010-202x The platformdirs developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + + - `_vendor/wheel/vendored/packaging/LICENSE`: +``` +This software is made available under the terms of *either* of the licenses +found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made +under the terms of *both* these licenses. +``` + + - `_vendor/wheel/vendored/packaging/LICENSE.BSD`: +``` +Copyright (c) Donald Stufft and individual contributors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `_vendor/wheel/vendored/packaging/LICENSE.APACHE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS +``` + ### URLs - `Changelog`: https://setuptools.pypa.io/en/stable/history.html - `Documentation`: https://setuptools.pypa.io/ - `Source`: https://github.com/pypa/setuptools +## simplejson (3.20.2) + +### Licenses +License: `MIT License` + + - `LICENSE.txt`: +``` +simplejson is dual-licensed software. It is available under the terms +of the MIT license, or the Academic Free License version 2.1. The full +text of each license agreement is included below. This code is also +licensed to the Python Software Foundation (PSF) under a Contributor +Agreement. 
+ +MIT License +=========== + +Copyright (c) 2006 Bob Ippolito + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Academic Free License v. 2.1 +============================ + +Copyright (c) 2006 Bob Ippolito. All rights reserved. + +This Academic Free License (the "License") applies to any original work of authorship (the "Original Work") whose owner (the "Licensor") has placed the following notice immediately following the copyright notice for the Original Work: + +Licensed under the Academic Free License version 2.1 + +1) Grant of Copyright License. Licensor hereby grants You a world-wide, royalty-free, non-exclusive, perpetual, sublicenseable license to do the following: + +a) to reproduce the Original Work in copies; + +b) to prepare derivative works ("Derivative Works") based upon the Original Work; + +c) to distribute copies of the Original Work and Derivative Works to the public; + +d) to perform the Original Work publicly; and + +e) to display the Original Work publicly. 
+ +2) Grant of Patent License. Licensor hereby grants You a world-wide, royalty-free, non-exclusive, perpetual, sublicenseable license, under patent claims owned or controlled by the Licensor that are embodied in the Original Work as furnished by the Licensor, to make, use, sell and offer for sale the Original Work and Derivative Works. + +3) Grant of Source Code License. The term "Source Code" means the preferred form of the Original Work for making modifications to it and all available documentation describing how to modify the Original Work. Licensor hereby agrees to provide a machine-readable copy of the Source Code of the Original Work along with each copy of the Original Work that Licensor distributes. Licensor reserves the right to satisfy this obligation by placing a machine-readable copy of the Source Code in an information repository reasonably calculated to permit inexpensive and convenient access by You for as long as Licensor continues to distribute the Original Work, and by publishing the address of that information repository in a notice immediately following the copyright notice that applies to the Original Work. + +4) Exclusions From License Grant. Neither the names of Licensor, nor the names of any contributors to the Original Work, nor any of their trademarks or service marks, may be used to endorse or promote products derived from this Original Work without express prior written permission of the Licensor. Nothing in this License shall be deemed to grant any rights to trademarks, copyrights, patents, trade secrets or any other intellectual property of Licensor except as expressly stated herein. No patent license is granted to make, use, sell or offer to sell embodiments of any patent claims other than the licensed claims defined in Section 2. No right is granted to the trademarks of Licensor even if such marks are included in the Original Work. 
Nothing in this License shall be interpreted to prohibit Licensor from licensing under different terms from this License any Original Work that Licensor otherwise would have a right to license. + +5) This section intentionally omitted. + +6) Attribution Rights. You must retain, in the Source Code of any Derivative Works that You create, all copyright, patent or trademark notices from the Source Code of the Original Work, as well as any notices of licensing and any descriptive text identified therein as an "Attribution Notice." You must cause the Source Code for any Derivative Works that You create to carry a prominent Attribution Notice reasonably calculated to inform recipients that You have modified the Original Work. + +7) Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that the copyright in and to the Original Work and the patent rights granted herein by Licensor are owned by the Licensor or are sublicensed to You under the terms of this License with the permission of the contributor(s) of those copyrights and patent rights. Except as expressly stated in the immediately proceeding sentence, the Original Work is provided under this License on an "AS IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without limitation, the warranties of NON-INFRINGEMENT, MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this License. No license to Original Work is granted hereunder except under this disclaimer. + +8) Limitation of Liability. 
Under no circumstances and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Licensor be liable to any person for any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or the use of the Original Work including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses. This limitation of liability shall not apply to liability for death or personal injury resulting from Licensor's negligence to the extent applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion or limitation of incidental or consequential damages, so this exclusion and limitation may not apply to You. + +9) Acceptance and Termination. If You distribute copies of the Original Work or a Derivative Work, You must make a reasonable effort under the circumstances to obtain the express assent of recipients to the terms of this License. Nothing else but this License (or another written agreement between Licensor and You) grants You permission to create Derivative Works based upon the Original Work or to exercise any of the rights granted in Section 1 herein, and any attempt to do so except under the terms of this License (or another written agreement between Licensor and You) is expressly prohibited by U.S. copyright law, the equivalent laws of other countries, and by international treaty. Therefore, by exercising any of the rights granted to You in Section 1 herein, You indicate Your acceptance of this License and all of its terms and conditions. + +10) Termination for Patent Action. This License shall terminate automatically and You may no longer exercise any of the rights granted to You by this License as of the date You commence an action, including a cross-claim or counterclaim, against Licensor or any licensee alleging that the Original Work infringes a patent. 
This termination provision shall not apply for an action alleging patent infringement by combinations of the Original Work with other software or hardware. + +11) Jurisdiction, Venue and Governing Law. Any action or suit relating to this License may be brought only in the courts of a jurisdiction wherein the Licensor resides or in which Licensor conducts its primary business, and under the laws of that jurisdiction excluding its conflict-of-law provisions. The application of the United Nations Convention on Contracts for the International Sale of Goods is expressly excluded. Any use of the Original Work outside the scope of this License or after its termination shall be subject to the requirements and penalties of the U.S. Copyright Act, 17 U.S.C. § 101 et seq., the equivalent laws of other countries, and international treaty. This section shall survive the termination of this License. + +12) Attorneys Fees. In any action to enforce the terms of this License or seeking damages relating thereto, the prevailing party shall be entitled to recover its costs and expenses, including, without limitation, reasonable attorneys' fees and costs incurred in connection with such action, including any appeal of such action. This section shall survive the termination of this License. + +13) Miscellaneous. This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. + +14) Definition of "You" in This License. "You" throughout this License, whether in upper or lower case, means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with you. 
For purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +15) Right to Use. You may use the Original Work in all ways not otherwise restricted or conditioned by this License or by law, and Licensor promises not to interfere with or be responsible for such uses by You. + +This license is Copyright (C) 2003-2004 Lawrence E. Rosen. All rights reserved. Permission is hereby granted to copy and distribute this license without modification. This license may not be modified without the express written permission of its copyright owner. +``` + +### URLs + - `Homepage`: https://github.com/simplejson/simplejson + + ## six (1.17.0) ### Licenses @@ -42243,6 +59523,37 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ### Licenses License: `MIT OR Apache-2.0` + - `LICENSE`: +``` +This software is made available under the terms of *either* of the +licenses found in LICENSE.APACHE2 or LICENSE.MIT. Contributions to are +made under the terms of *both* these licenses. +``` + + - `LICENSE.MIT`: +``` +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + - `LICENSE.APACHE2`: ``` @@ -42449,37 +59760,6 @@ License: `MIT OR Apache-2.0` limitations under the License. ``` - - `LICENSE`: -``` -This software is made available under the terms of *either* of the -licenses found in LICENSE.APACHE2 or LICENSE.MIT. Contributions to are -made under the terms of *both* these licenses. -``` - - - `LICENSE.MIT`: -``` -The MIT License (MIT) - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-``` - ### URLs - `Changelog`: https://sniffio.readthedocs.io/en/latest/history.html - `Documentation`: https://sniffio.readthedocs.io/ @@ -42528,6 +59808,255 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Homepage`: https://github.com/bastibe/python-soundfile +## SQLAlchemy (2.0.44) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Copyright 2005-2025 SQLAlchemy authors and contributors . + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Documentation`: https://docs.sqlalchemy.org + - `Homepage`: https://www.sqlalchemy.org + - `Issue Tracker`: https://github.com/sqlalchemy/sqlalchemy/ + + +## sqlitedict (2.1.0) + +### Licenses +License: `Apache 2.0` + + - `licenses/LICENSE.md`: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) 2011-now `Radim Řehůřek `_ and contributors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Download`: http://pypi.python.org/pypi/sqlitedict + - `Homepage`: https://github.com/piskvorky/sqlitedict + + ## starlette (0.48.0) ### Licenses @@ -42572,7 +60101,306 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - `Source`: https://github.com/Kludex/starlette -## strenum (0.4.15) +## statsmodels (0.14.5) + +### Licenses +License: `BSD License` + + - `LICENSE.txt`: +``` +Copyright (C) 2006, Jonathan E. Taylor +All rights reserved. + +Copyright (c) 2006-2008 Scipy Developers. +All rights reserved. + +Copyright (c) 2009-2018 statsmodels Developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of statsmodels nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. +``` + + - `stats/libqsturng/LICENSE.txt`: +``` +Copyright (c) 2011, Roger Lew [see LICENSE.txt] +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ * Neither the name of the organizations affiliated with the + contributors or the names of its contributors themselves may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +``` + +### URLs + - `Bug Tracker`: https://github.com/statsmodels/statsmodels/issues + - `Documentation`: https://www.statsmodels.org/stable/index.html + - `Homepage`: https://www.statsmodels.org/ + - `Source Code`: https://github.com/statsmodels/statsmodels + + +## stevedore (5.6.0) + +### Licenses +License: `Apache-2.0` + + - `LICENSE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### URLs + - `Homepage`: https://docs.openstack.org/stevedore + - `Repository`: https://opendev.org/openstack/stevedore + + +## StrEnum (0.4.15) ### Licenses License: `MIT License` @@ -42744,7 +60572,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -The files under the directory sympy/parsing/latex +The files under the directory sympy/parsing/latex are directly copied from latex2sympy project and are licensed as: Copyright 2016, latex2sympy @@ -42773,6 +60601,44 @@ SOFTWARE. - `Source`: https://github.com/sympy/sympy +## tabledata (1.3.4) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2017-2024 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://github.com/thombashi/tabledata/releases + - `Documentation`: https://tabledata.rtfd.io/ + - `Homepage`: https://github.com/thombashi/tabledata + - `Source`: https://github.com/thombashi/tabledata + - `Tracker`: https://github.com/thombashi/tabledata/issues + + ## tabulate (0.9.0) ### Licenses @@ -42806,6 +60672,43 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - `Homepage`: https://github.com/astanin/python-tabulate +## tcolorpy (0.1.7) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2020-2024 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://github.com/thombashi/tcolorpy/blob/master/CHANGELOG.md + - `Homepage`: https://github.com/thombashi/tcolorpy + - `Source`: https://github.com/thombashi/tcolorpy + - `Tracker`: https://github.com/thombashi/tcolorpy/issues + + ## tenacity (9.1.2) ### Licenses @@ -43020,7 +60923,7 @@ License: `Apache 2.0` - `Homepage`: https://github.com/jd/tenacity -## tensorrt (10.13.3.9) +## tensorrt (10.13.3.9.post1) ### Licenses License: `Proprietary` @@ -43214,7 +61117,7 @@ Copyright - `Homepage`: https://github.com/nvidia/tensorrt -## tensorrt-cu13 (10.13.3.9) +## tensorrt_cu13 (10.13.3.9.post1) ### Licenses License: `Proprietary` @@ -43408,7 +61311,7 @@ Copyright - `Homepage`: https://github.com/nvidia/tensorrt -## tensorrt-cu13-bindings (10.13.3.9) +## tensorrt_cu13_bindings (10.13.3.9.post1) ### Licenses License: `Proprietary` @@ -43602,7 +61505,7 @@ Copyright - `Homepage`: https://github.com/nvidia/tensorrt -## tensorrt-cu13-libs (10.13.3.9) +## tensorrt_cu13_libs (10.13.3.9.post1) ### Licenses License: `Proprietary` @@ -43796,217 +61699,40 @@ Copyright - `Homepage`: https://github.com/nvidia/tensorrt -## tensorrt-llm (1.2.0rc1) +## threadpoolctl (3.6.0) ### Licenses -License: `None` +License: `BSD-3-Clause` - `licenses/LICENSE`: ``` +Copyright (c) 2019, threadpoolctl contributors - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the 
following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. 
Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative 
Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -``` +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.``` +### URLs + - `Homepage`: https://github.com/joblib/threadpoolctl ## tiktoken (0.12.0) @@ -44260,10 +61986,10 @@ License: `Apache Software License` - `Source`: https://github.com/huggingface/tokenizers -## torch (2.8.0) +## torch (2.9.0+cu130) ### Licenses -License: `BSD-3-Clause` +License: `None` - `LICENSE`: ``` @@ -44290,12 +62016,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` -### URLs - - `Documentation`: https://pytorch.org/docs/ - - `Download`: https://github.com/pytorch/pytorch/tags - - `Forum`: https://discuss.pytorch.org/ - - `Homepage`: https://pytorch.org/ - - `Source`: https://github.com/pytorch/pytorch ## torchprofile (0.0.4) @@ -44332,16 +62052,16 @@ SOFTWARE. - `Homepage`: https://github.com/zhijian-liu/torchprofile/ -## torchvision (0.23.0) +## torchvision (0.24.0+cu130) ### Licenses -License: `BSD` +License: `None` - `LICENSE`: ``` BSD 3-Clause License -Copyright (c) Soumith Chintala 2016, +Copyright (c) Soumith Chintala 2016, All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44370,8 +62090,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` -### URLs - - `Homepage`: https://github.com/pytorch/vision ## tqdm (4.67.1) @@ -44379,57 +62097,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
### Licenses License: `MPL-2.0 AND MIT` - - `LICENCE`: + - `LICENSE`: ``` -`tqdm` is a product of collaborative work. -Unless otherwise stated, all authors (see commit logs) retain copyright -for their respective work, and release the work under the MIT licence -(text below). +MIT License -Exceptions or notable authors are listed below -in reverse chronological order: +Copyright (c) 2020 EleutherAI -* files: * - MPL-2.0 2015-2024 (c) Casper da Costa-Luis - [casperdcl](https://github.com/casperdcl). -* files: tqdm/_tqdm.py - MIT 2016 (c) [PR #96] on behalf of Google Inc. -* files: tqdm/_tqdm.py README.rst .gitignore - MIT 2013 (c) Noam Yorav-Raphael, original author. - -[PR #96]: https://github.com/tqdm/tqdm/pull/96 - - -Mozilla Public Licence (MPL) v. 2.0 - Exhibit A ------------------------------------------------ - -This Source Code Form is subject to the terms of the -Mozilla Public License, v. 2.0. -If a copy of the MPL was not distributed with this project, -You can obtain one at https://mozilla.org/MPL/2.0/. 
- - -MIT License (MIT) ------------------ - -Copyright (c) 2013 noamraph - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
``` ### URLs @@ -44439,6 +62129,40 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - `wiki`: https://github.com/tqdm/tqdm/wiki +## tqdm-multiprocess (0.0.11) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2020 EleutherAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Homepage`: https://github.com/EleutherAI/tqdm-multiprocess + + ## transformers (4.56.0) ### Licenses @@ -44655,48 +62379,126 @@ Copyright 2018- The Hugging Face team. All rights reserved. 
- `Homepage`: https://github.com/huggingface/transformers -## triton (3.4.0) +## triton (3.5.0) ### Licenses License: `MIT License` - - `licenses/LICENSE`: + - `LICENSE.txt`: ``` -/* -* Copyright 2018-2020 Philippe Tillet -* Copyright 2020-2022 OpenAI -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` ### URLs - `Homepage`: https://github.com/triton-lang/triton/ -## typing-extensions (4.15.0) +## tritonclient (2.63.0) ### Licenses -License: `PSF-2.0` +License: `BSD` - - `licenses/LICENSE`: + - `LICENSE.txt`: +``` +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + +### URLs + - `Homepage`: https://developer.nvidia.com/nvidia-triton-inference-server + + +## typepy (1.3.4) + +### Licenses +License: `MIT License` + + - `LICENSE`: +``` +MIT License + +Copyright (c) 2017-2024 Tsuyoshi Hombashi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Changelog`: https://github.com/thombashi/typepy/releases + - `Documentation`: https://typepy.rtfd.io/ + - `Homepage`: https://github.com/thombashi/typepy + - `Source`: https://github.com/thombashi/typepy + - `Tracker`: https://github.com/thombashi/typepy/issues + + +## typing_extensions (4.12.2) + +### Licenses +License: `Python Software Foundation License` + + - `LICENSE`: ``` A. HISTORY OF THE SOFTWARE ========================== @@ -45298,15 +63100,14 @@ SOFTWARE. - `Issue tracker`: https://github.com/urllib3/urllib3/issues -## uvicorn (0.37.0) +## uvicorn (0.38.0) ### Licenses License: `BSD-3-Clause` - `licenses/LICENSE.md`: ``` -Copyright © 2017, [Encode OSS Ltd](https://www.encode.io/). -Copyright © 2025, Marcelo Trylesinski +Copyright © 2017-present, [Encode OSS Ltd](https://www.encode.io/). All rights reserved. Redistribution and use in source and binary forms, with or without @@ -45342,34 +63143,260 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- `Source`: https://github.com/Kludex/uvicorn +## virtualenv (20.35.4) + +### Licenses +License: `MIT` + + - `licenses/LICENSE`: +``` +Copyright (c) 2020-202x The virtualenv developers + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `Documentation`: https://virtualenv.pypa.io + - `Homepage`: https://github.com/pypa/virtualenv + - `Source`: https://github.com/pypa/virtualenv + - `Tracker`: https://github.com/pypa/virtualenv/issues + + ## wheel (0.45.1) ### Licenses License: `MIT License` - - `LICENSE.txt`: + - `vendored/packaging/LICENSE`: +``` +This software is made available under the terms of *either* of the licenses +found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made +under the terms of *both* these licenses. ``` -MIT License -Copyright (c) 2012 Daniel Holth and contributors + - `vendored/packaging/LICENSE.BSD`: +``` +Copyright (c) Donald Stufft and individual contributors. +All rights reserved. 
-Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the "Software"), -to deal in the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions: +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + + - `vendored/packaging/LICENSE.APACHE`: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS ``` ### URLs @@ -45379,6 +63406,41 @@ OTHER DEALINGS IN THE SOFTWARE. 
- `Source`: https://github.com/pypa/wheel +## word2number (1.1) + +### Licenses +License: `The MIT License (MIT)` + + - `licenses/LICENSE.txt`: +``` +The MIT License (MIT) + +Copyright (c) 2016 Akshay Nagpal (https://github.com/akshaynagpal) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +### URLs + - `Download`: https://github.com/akshaynagpal/w2n/tarball/1.1 + - `Homepage`: https://github.com/akshaynagpal/w2n + + ## xgrammar (0.1.25) ### Licenses @@ -45882,3 +63944,44 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. ### URLs - `Source`: https://github.com/jaraco/zipp + + +## zstandard (0.25.0) + +### Licenses +License: `BSD-3-Clause` + + - `licenses/LICENSE`: +``` +Copyright (c) 2016, Gregory Szorc +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+``` + +### URLs + - `Documentation`: https://python-zstandard.readthedocs.io/en/latest/ + - `Homepage`: https://github.com/indygreg/python-zstandard diff --git a/requirements-dev.txt b/requirements-dev.txt index c8293761ea..e2ae04d955 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,3 +31,7 @@ ruff==0.9.4 lm_eval[api]==0.4.8 docstring_parser genai-perf==0.0.13 +opentelemetry-sdk>=1.26.0 +opentelemetry-api>=1.26.0 +opentelemetry-exporter-otlp>=1.26.0 +opentelemetry-semantic-conventions-ai>=0.4.1 diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 4958202da2..156cfc3d10 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -26,6 +26,7 @@ l0_a10: - unittest/_torch/models/checkpoints/hf/test_weight_loader.py - unittest/_torch/models/checkpoints/hf/test_checkpoint_loader.py - unittest/others/test_time_breakdown.py + - unittest/others/test_tracing.py - unittest/disaggregated/test_disagg_openai_client.py - unittest/disaggregated/test_disagg_utils.py - unittest/disaggregated/test_router.py diff --git a/tests/unittest/others/test_tracing.py b/tests/unittest/others/test_tracing.py new file mode 100644 index 0000000000..01da3716d3 --- /dev/null +++ b/tests/unittest/others/test_tracing.py @@ -0,0 +1,204 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import tempfile +import threading +from collections.abc import Iterable +from concurrent import futures +from typing import Callable, Dict, Generator, Literal + +import openai +import pytest +import yaml +from llmapi.apps.openai_server import RemoteOpenAIServer +from llmapi.test_llm import get_model_path +from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceResponse +from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import ( + TraceServiceServicer, + add_TraceServiceServicer_to_server, +) +from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue +from opentelemetry.sdk.environment_variables import OTEL_EXPORTER_OTLP_TRACES_INSECURE + +from tensorrt_llm.llmapi.tracing import SpanAttributes + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class FakeTraceService(TraceServiceServicer): + def __init__(self): + self.request = None + self.evt = threading.Event() + + def Export(self, request, context): + self.request = request + self.evt.set() + return ExportTraceServiceResponse() + + +# The trace service binds a free port at runtime and exposes its address via the fixture. +@pytest.fixture(scope="module") +def trace_service() -> Generator[FakeTraceService, None, None]: + executor = futures.ThreadPoolExecutor(max_workers=1) + import grpc + + server = grpc.server(executor) + service = FakeTraceService() + add_TraceServiceServicer_to_server(service, server) + # Bind to an ephemeral port to avoid conflicts with local collectors. 
+ port = server.add_insecure_port("localhost:0") + service.address = f"localhost:{port}" + server.start() + + yield service + + server.stop(None) + executor.shutdown(wait=True) + + +@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"]) +def model_name(): + return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" + + +@pytest.fixture(scope="module", params=["pytorch"]) +def backend(request): + return request.param + + +@pytest.fixture(scope="module", params=[0], ids=["disable_processpool"]) +def num_postprocess_workers(request): + return request.param + + +@pytest.fixture(scope="module") +def temp_extra_llm_api_options_file(request): + temp_dir = tempfile.gettempdir() + temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml") + try: + extra_llm_api_options_dict = { + "enable_chunked_prefill": False, + "kv_cache_config": {"enable_block_reuse": False, "max_tokens": 40000}, + "return_perf_metrics": True, + } + + with open(temp_file_path, "w") as f: + yaml.dump(extra_llm_api_options_dict, f) + + yield temp_file_path + finally: + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + + +@pytest.fixture(scope="module") +def server( + model_name: str, + backend: str, + temp_extra_llm_api_options_file: str, + num_postprocess_workers: int, + trace_service: FakeTraceService, +): + model_path = get_model_path(model_name) + args = ["--backend", f"{backend}"] + if backend == "trt": + args.extend(["--max_beam_width", "4"]) + args.extend(["--extra_llm_api_options", temp_extra_llm_api_options_file]) + args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"]) + args.extend(["--otlp_traces_endpoint", trace_service.address]) + + os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" + + with RemoteOpenAIServer(model_path, args) as remote_server: + yield remote_server + + +FieldName = Literal["bool_value", "string_value", "int_value", "double_value", "array_value"] + + +def decode_value(value: AnyValue): + field_decoders: Dict[FieldName, 
Callable[[AnyValue], object]] = { + "bool_value": (lambda v: v.bool_value), + "string_value": (lambda v: v.string_value), + "int_value": (lambda v: v.int_value), + "double_value": (lambda v: v.double_value), + "array_value": (lambda v: [decode_value(item) for item in v.array_value.values]), + } + for field, decoder in field_decoders.items(): + if value.HasField(field): + return decoder(value) + raise ValueError(f"Couldn't decode value: {value}") + + +def decode_attributes(attributes: Iterable[KeyValue]): + return {kv.key: decode_value(kv.value) for kv in attributes} + + +@pytest.fixture(scope="module") +def client(server: RemoteOpenAIServer): + return server.get_client() + + +@pytest.fixture(scope="module") +def async_client(server: RemoteOpenAIServer): + return server.get_async_client() + + +@pytest.mark.threadleak(enabled=False) +def test_tracing(client: openai.OpenAI, model_name: str, trace_service: FakeTraceService): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] + + temperature = 0.9 + top_p = 0.9 + max_completion_tokens = 10 + + chat_completion = client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=temperature, + top_p=top_p, + logprobs=False, + ) + + timeout = 10 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within the {timeout} seconds timeout" + ) + + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, but got {len(request.resource_spans)}" + ) + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, but got {len(request.resource_spans[0].scope_spans)}" + ) + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, but got {len(request.resource_spans[0].scope_spans[0].spans)}" + ) + + attributes = 
decode_attributes(request.resource_spans[0].scope_spans[0].spans[0].attributes) + + assert ( + attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) + == chat_completion.usage.completion_tokens + ) + assert ( + attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) + == chat_completion.usage.prompt_tokens + ) + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == max_completion_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) == top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE) == temperature + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0 + assert len(attributes.get(SpanAttributes.GEN_AI_RESPONSE_FINISH_REASONS)) > 0 From 6a39bb983ccc41a4ee1e8ccaf693eb95fe3ee497 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Wed, 10 Dec 2025 03:07:34 +0000 Subject: [PATCH 048/172] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- .../examples/models/core/whisper/poetry.lock | 90 +++++++++---------- security_scanning/metadata.json | 4 +- security_scanning/poetry.lock | 46 +++++----- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index f13ed38ca8..abac1b6c4b 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -915,32 +915,32 @@ tests = ["matplotlib (>=3.5.0)", "packaging (>=20.0)", "pytest", "pytest-cov", " [[package]] name = "llvmlite" -version = "0.45.1" +version = "0.46.0" description = "lightweight wrapper around basic LLVM functionality" optional = false python-versions = 
">=3.10" files = [ - {file = "llvmlite-0.45.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:1b1af0c910af0978aa55fa4f60bbb3e9f39b41e97c2a6d94d199897be62ba07a"}, - {file = "llvmlite-0.45.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02a164db2d79088bbd6e0d9633b4fe4021d6379d7e4ac7cc85ed5f44b06a30c5"}, - {file = "llvmlite-0.45.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f2d47f34e4029e6df3395de34cc1c66440a8d72712993a6e6168db228686711b"}, - {file = "llvmlite-0.45.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f7319e5f9f90720578a7f56fbc805bdfb4bc071b507c7611f170d631c3c0f1e0"}, - {file = "llvmlite-0.45.1-cp310-cp310-win_amd64.whl", hash = "sha256:4edb62e685867799e336723cb9787ec6598d51d0b1ed9af0f38e692aa757e898"}, - {file = "llvmlite-0.45.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:60f92868d5d3af30b4239b50e1717cb4e4e54f6ac1c361a27903b318d0f07f42"}, - {file = "llvmlite-0.45.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98baab513e19beb210f1ef39066288784839a44cd504e24fff5d17f1b3cf0860"}, - {file = "llvmlite-0.45.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3adc2355694d6a6fbcc024d59bb756677e7de506037c878022d7b877e7613a36"}, - {file = "llvmlite-0.45.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f3377a6db40f563058c9515dedcc8a3e562d8693a106a28f2ddccf2c8fcf6ca"}, - {file = "llvmlite-0.45.1-cp311-cp311-win_amd64.whl", hash = "sha256:f9c272682d91e0d57f2a76c6d9ebdfccc603a01828cdbe3d15273bdca0c3363a"}, - {file = "llvmlite-0.45.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:28e763aba92fe9c72296911e040231d486447c01d4f90027c8e893d89d49b20e"}, - {file = "llvmlite-0.45.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1a53f4b74ee9fd30cb3d27d904dadece67a7575198bd80e687ee76474620735f"}, - {file = "llvmlite-0.45.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:5b3796b1b1e1c14dcae34285d2f4ea488402fbd2c400ccf7137603ca3800864f"}, - {file = "llvmlite-0.45.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:779e2f2ceefef0f4368548685f0b4adde34e5f4b457e90391f570a10b348d433"}, - {file = "llvmlite-0.45.1-cp312-cp312-win_amd64.whl", hash = "sha256:9e6c9949baf25d9aa9cd7cf0f6d011b9ca660dd17f5ba2b23bdbdb77cc86b116"}, - {file = "llvmlite-0.45.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:d9ea9e6f17569a4253515cc01dade70aba536476e3d750b2e18d81d7e670eb15"}, - {file = "llvmlite-0.45.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:c9f3cadee1630ce4ac18ea38adebf2a4f57a89bd2740ce83746876797f6e0bfb"}, - {file = "llvmlite-0.45.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:57c48bf2e1083eedbc9406fb83c4e6483017879714916fe8be8a72a9672c995a"}, - {file = "llvmlite-0.45.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3aa3dfceda4219ae39cf18806c60eeb518c1680ff834b8b311bd784160b9ce40"}, - {file = "llvmlite-0.45.1-cp313-cp313-win_amd64.whl", hash = "sha256:080e6f8d0778a8239cd47686d402cb66eb165e421efa9391366a9b7e5810a38b"}, - {file = "llvmlite-0.45.1.tar.gz", hash = "sha256:09430bb9d0bb58fc45a45a57c7eae912850bedc095cd0810a57de109c69e1c32"}, + {file = "llvmlite-0.46.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4323177e936d61ae0f73e653e2e614284d97d14d5dd12579adc92b6c2b0597b0"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a2d461cb89537b7c20feb04c46c32e12d5ad4f0896c9dfc0f60336219ff248e"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1f6595a35b7b39c3518b85a28bf18f45e075264e4b2dce3f0c2a4f232b4a910"}, + {file = "llvmlite-0.46.0-cp310-cp310-win_amd64.whl", hash = "sha256:e7a34d4aa6f9a97ee006b504be6d2b8cb7f755b80ab2f344dda1ef992f828559"}, + {file = "llvmlite-0.46.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:82f3d39b16f19aa1a56d5fe625883a6ab600d5cc9ea8906cca70ce94cabba067"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a3df43900119803bbc52720e758c76f316a9a0f34612a886862dfe0a5591a17e"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de183fefc8022d21b0aa37fc3e90410bc3524aed8617f0ff76732fc6c3af5361"}, + {file = "llvmlite-0.46.0-cp311-cp311-win_amd64.whl", hash = "sha256:e8b10bc585c58bdffec9e0c309bb7d51be1f2f15e169a4b4d42f2389e431eb93"}, + {file = "llvmlite-0.46.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b9588ad4c63b4f0175a3984b85494f0c927c6b001e3a246a3a7fb3920d9a137"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3535bd2bb6a2d7ae4012681ac228e5132cdb75fefb1bcb24e33f2f3e0c865ed4"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cbfd366e60ff87ea6cc62f50bc4cd800ebb13ed4c149466f50cf2163a473d1e"}, + {file = "llvmlite-0.46.0-cp312-cp312-win_amd64.whl", hash = "sha256:398b39db462c39563a97b912d4f2866cd37cba60537975a09679b28fbbc0fb38"}, + {file = "llvmlite-0.46.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:30b60892d034bc560e0ec6654737aaa74e5ca327bd8114d82136aa071d611172"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6cc19b051753368a9c9f31dc041299059ee91aceec81bd57b0e385e5d5bf1a54"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bca185892908f9ede48c0acd547fe4dc1bafefb8a4967d47db6cf664f9332d12"}, + {file = "llvmlite-0.46.0-cp313-cp313-win_amd64.whl", hash = "sha256:67438fd30e12349ebb054d86a5a1a57fd5e87d264d2451bcfafbbbaa25b82a35"}, + {file = "llvmlite-0.46.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:d252edfb9f4ac1fcf20652258e3f102b26b03eef738dc8a6ffdab7d7d341d547"}, + {file = 
"llvmlite-0.46.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:379fdd1c59badeff8982cb47e4694a6143bec3bb49aa10a466e095410522064d"}, + {file = "llvmlite-0.46.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e8cbfff7f6db0fa2c771ad24154e2a7e457c2444d7673e6de06b8b698c3b269"}, + {file = "llvmlite-0.46.0-cp314-cp314-win_amd64.whl", hash = "sha256:7821eda3ec1f18050f981819756631d60b6d7ab1a6cf806d9efefbe3f4082d61"}, + {file = "llvmlite-0.46.0.tar.gz", hash = "sha256:227c9fd6d09dce2783c18b754b7cd9d9b3b3515210c46acc2d3c5badd9870ceb"}, ] [[package]] @@ -1343,36 +1343,36 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "numba" -version = "0.62.1" +version = "0.63.0" description = "compiling Python code using LLVM" optional = false python-versions = ">=3.10" files = [ - {file = "numba-0.62.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a323df9d36a0da1ca9c592a6baaddd0176d9f417ef49a65bb81951dce69d941a"}, - {file = "numba-0.62.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1e1f4781d3f9f7c23f16eb04e76ca10b5a3516e959634bd226fc48d5d8e7a0a"}, - {file = "numba-0.62.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:14432af305ea68627a084cd702124fd5d0c1f5b8a413b05f4e14757202d1cf6c"}, - {file = "numba-0.62.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f180922adf159ae36c2fe79fb94ffaa74cf5cb3688cb72dba0a904b91e978507"}, - {file = "numba-0.62.1-cp310-cp310-win_amd64.whl", hash = "sha256:f41834909d411b4b8d1c68f745144136f21416547009c1e860cc2098754b4ca7"}, - {file = "numba-0.62.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:f43e24b057714e480fe44bc6031de499e7cf8150c63eb461192caa6cc8530bc8"}, - {file = "numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:57cbddc53b9ee02830b828a8428757f5c218831ccc96490a314ef569d8342b7b"}, - {file = "numba-0.62.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:604059730c637c7885386521bb1b0ddcbc91fd56131a6dcc54163d6f1804c872"}, - {file = "numba-0.62.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d6c540880170bee817011757dc9049dba5a29db0c09b4d2349295991fe3ee55f"}, - {file = "numba-0.62.1-cp311-cp311-win_amd64.whl", hash = "sha256:03de6d691d6b6e2b76660ba0f38f37b81ece8b2cc524a62f2a0cfae2bfb6f9da"}, - {file = "numba-0.62.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:1b743b32f8fa5fff22e19c2e906db2f0a340782caf024477b97801b918cf0494"}, - {file = "numba-0.62.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:90fa21b0142bcf08ad8e32a97d25d0b84b1e921bc9423f8dda07d3652860eef6"}, - {file = "numba-0.62.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6ef84d0ac19f1bf80431347b6f4ce3c39b7ec13f48f233a48c01e2ec06ecbc59"}, - {file = "numba-0.62.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9315cc5e441300e0ca07c828a627d92a6802bcbf27c5487f31ae73783c58da53"}, - {file = "numba-0.62.1-cp312-cp312-win_amd64.whl", hash = "sha256:44e3aa6228039992f058f5ebfcfd372c83798e9464297bdad8cc79febcf7891e"}, - {file = "numba-0.62.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:b72489ba8411cc9fdcaa2458d8f7677751e94f0109eeb53e5becfdc818c64afb"}, - {file = "numba-0.62.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:44a1412095534a26fb5da2717bc755b57da5f3053965128fe3dc286652cc6a92"}, - {file = "numba-0.62.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8c9460b9e936c5bd2f0570e20a0a5909ee6e8b694fd958b210e3bde3a6dba2d7"}, - {file = "numba-0.62.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:728f91a874192df22d74e3fd42c12900b7ce7190b1aad3574c6c61b08313e4c5"}, - {file = "numba-0.62.1-cp313-cp313-win_amd64.whl", hash = "sha256:bbf3f88b461514287df66bc8d0307e949b09f2b6f67da92265094e8fa1282dd8"}, - {file = "numba-0.62.1.tar.gz", hash = 
"sha256:7b774242aa890e34c21200a1fc62e5b5757d5286267e71103257f4e2af0d5161"}, + {file = "numba-0.63.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:09534fd6e7a08a2b26c36449e62120563ed548c91c5ec5e00b10ecac8fb86460"}, + {file = "numba-0.63.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ff6ad346bf5010a02fedec7e7161947109cc45bedfacdad609d1d7d7aec34426"}, + {file = "numba-0.63.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91908f74c64ec0a2006a53b01bffd088d3289d403b063584197974fabc431aff"}, + {file = "numba-0.63.0-cp310-cp310-win_amd64.whl", hash = "sha256:d900bee63b2546352f3bbb533beb74f9825c8f58afb80632625a2d9606ea56af"}, + {file = "numba-0.63.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94ae468e1a1ff9b6f8b8e6920caa46f353bed7c088077912a310f737966147cb"}, + {file = "numba-0.63.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a801fff99a7ccdd79405b061f9b624234b84263c5e2a5a38408e8fb19fc3a243"}, + {file = "numba-0.63.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0627914cc99e4b1ed386b74f81c819c9bc67fb4a0447695309881529fa534d2"}, + {file = "numba-0.63.0-cp311-cp311-win_amd64.whl", hash = "sha256:b24bfffc2e877581ff13cc3e041f69224939c616dd2b1ef20cd856e743ad66fd"}, + {file = "numba-0.63.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7e15588b2e3f4ea8c74d294d0a9f3b8262e7a34c4c5b4f5850d5779334a13d20"}, + {file = "numba-0.63.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2279cbcd459440eca36be1078ad14e96a7a339124065a83577feb8849d28453d"}, + {file = "numba-0.63.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18add2794439ae7289f1836ef73901b2a9e2c1fd91f7389af0c447626955519d"}, + {file = "numba-0.63.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5343da27ce538bf60b05397d378ae71e16c7fea99a5973a7f42eb51d3471e20"}, + {file = "numba-0.63.0-cp313-cp313-macosx_12_0_arm64.whl", hash = 
"sha256:26859efd087b9eafc3da450a69a0211903622f723fce2da9aef84f04c01f804b"}, + {file = "numba-0.63.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:491fd265dd3ca837d31486fd688ec5331eb79213160c88fcb18555d128dc99ac"}, + {file = "numba-0.63.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2dd16fceaffff2f6f36abe9f03291e1d732b7d0978524a315f1ea9ee380d1859"}, + {file = "numba-0.63.0-cp313-cp313-win_amd64.whl", hash = "sha256:78ff9d00ab3374f87683bf902195c990049cdea7dd2d82d09b5a9fcdb68f6ae0"}, + {file = "numba-0.63.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:7593766bd0210d8dece4915de8a9e5393e93fdc6ad1ca8576555645a926cfe3a"}, + {file = "numba-0.63.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1c1d3bd8757d2ee674ba8b20b34c1f5aa5d40ba4e5d6424d608e8e880deeb7b2"}, + {file = "numba-0.63.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0f30d46a9e4b7699cdeeb020e1e323352266f147f1b8a3feb0d4d37cde2fddd"}, + {file = "numba-0.63.0-cp314-cp314-win_amd64.whl", hash = "sha256:34f59efe05e5237ed8cd4d592303467eac5b8fdc6fac716542140e6bcb5f9d7c"}, + {file = "numba-0.63.0.tar.gz", hash = "sha256:27e525ce6f9f727c4f61e89b9d453d4a7d0aabbbf110278988334f43cbd70fdc"}, ] [package.dependencies] -llvmlite = "==0.45.*" +llvmlite = "==0.46.*" numpy = ">=1.22,<2.4" [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 9eacde56c2..6f015659d4 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "d6f961d3fe1de4853ce99c91e2c69d205e35698c", - "timestamp": "2025-12-09T02:39:28Z" + "commit_hash": "36c9e7cfe670db782d69f37bcc772baaa5c86ff1", + "timestamp": "2025-12-10T02:39:25Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 18da005341..36dbdfcc91 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -742,29 +742,29 
@@ files = [ [[package]] name = "cuda-bindings" -version = "13.1.0" +version = "13.1.1" description = "Python bindings for CUDA" optional = false python-versions = ">=3.10" files = [ - {file = "cuda_bindings-13.1.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9dba661b641c4559ae28b3bd17620bdee469aec706feafcf666853c1b2df35eb"}, - {file = "cuda_bindings-13.1.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:122d3f0dde611e84bb0448cb2a1760b3022da5a09b24cc27ec6403fe3d3ebcb5"}, - {file = "cuda_bindings-13.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:7949148190c0611b3eae87698a8778ab4d1c1f220a066c83de01251a6ce577ae"}, - {file = "cuda_bindings-13.1.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f099d5e7e952ddafc62a1abad1ae30f46fcd3ebe683bac628c927d8050779e3"}, - {file = "cuda_bindings-13.1.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e4e4e396ebb051bd268ec3f97c7cf350fbf2eee8ff5cf0b572218ea52fde0960"}, - {file = "cuda_bindings-13.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:556867e171ada3bd7716133f9b689a89c5486110757d44f0884e0f1c1cf5cb98"}, - {file = "cuda_bindings-13.1.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f7a12d696987854b04a184c4e5e61f043b56bb94469d27c546a8aec55d9aa6be"}, - {file = "cuda_bindings-13.1.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f63214291d316e18152e277edc1f159083a931f9936e2e3d854da8155a5f0f7"}, - {file = "cuda_bindings-13.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:61510bfda2d4cd6afb589d39806f82e184beced26a5d8a12db10770ccbc99754"}, - {file = "cuda_bindings-13.1.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d3bdbb1b860f37450b6280d6a4d27716b952f480ef0be6daa27d04d9c4825ac"}, - {file = "cuda_bindings-13.1.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:72b4236637ece577b0dc8528a6a60beecb19e64ca9c6818386f982200e461057"}, - {file = "cuda_bindings-13.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:34181796a768cd7bb6a2e7407db76eed42a0d7e48a4b24aed502e9b485fcb0d5"}, - {file = "cuda_bindings-13.1.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12106d5bb73fff4c25740ae1bd56af9630306a7fff6a700de54015083ba3831b"}, - {file = "cuda_bindings-13.1.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80cccf2ce884f70d7d41d34fe622b4a8e6cadcc11ce943c7c5bedfb9285ac61c"}, - {file = "cuda_bindings-13.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:2fa92fad8c21cdeaa2e58a733e5013a7377840e2fbc0239757409141b90704c2"}, - {file = "cuda_bindings-13.1.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a73affa8b33e677b876cd570120023497a43a1045303a3ebf152749d0c5a93"}, - {file = "cuda_bindings-13.1.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7803f0a2b9085eb44805c8226907b7091d3afba5453c1c1ef6710f0151521ddb"}, - {file = "cuda_bindings-13.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2ac82549be8c9b4490f876caaeff5be95dd29147832982d2a4cccb906dab8373"}, + {file = "cuda_bindings-13.1.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4400370a83f1538e25ed4c18c34a0e9d5fad39741e282e69ce24d1479a11017d"}, + {file = "cuda_bindings-13.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f92500e2f6aec2dac00a5a1ce77d5aa77ea77b606dc484d951f1f2cc3eaa13"}, + {file = "cuda_bindings-13.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:3f5bb8190267216f96597235252087accac4cbccefd1b60756cced114b2d6754"}, + {file = "cuda_bindings-13.1.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5f5a6ade0ad45096568bc4dd1eb3377b65884d29124338fe9a4353130ef6631"}, + {file = "cuda_bindings-13.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9348f69b03b257f07159dd4c869615e139722c2bd81e96c66f6b8f77615efd82"}, + {file = "cuda_bindings-13.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:ec33b84f4bd65a86a734427f2b9cb8f221bedab2c4cfb681488cabc82f1d64ab"}, + {file = "cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e75ad0cb863330df784236d289612d71ca855c013d19ae00e5693574abd6915"}, + {file = "cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25785d1a3cdcd98f151240fd5efd025609319a6720a217dee2a929241749d488"}, + {file = "cuda_bindings-13.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:ccde9c95c0e953b31fe7731bb08da9d0a34b1770498df9a3c156fdfdbe3951ad"}, + {file = "cuda_bindings-13.1.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c9822a57c8f952dc367aacd7c32fe4cb17371104383606f455ea74635bff4c7"}, + {file = "cuda_bindings-13.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5837f5ea422c5653626dcfe22e9ab68142cd19af9e67a226100f224cc25a1b99"}, + {file = "cuda_bindings-13.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:9e4f348cd7a779657d51e6f71aac3965fb1738f40ff3bbe75265a3242fd6f29f"}, + {file = "cuda_bindings-13.1.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:86258fe1b0d3998bea7f57dc891569e4996705b8dd00366e44c722d0a29b2090"}, + {file = "cuda_bindings-13.1.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:daf8468fd603b2724c2d16cbd499348c64916ed72b1d04643f1660ce13cd12ae"}, + {file = "cuda_bindings-13.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:f2e079182014dbc162562b46467815272c14c7afe5b988978fa968728b0ac726"}, + {file = "cuda_bindings-13.1.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d0cd96a6ec00a78235947bff9462b2139bc5b83ce8e297d865802f0b52d1e23d"}, + {file = "cuda_bindings-13.1.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9ff465829c6c394c2b4047250324a19925cf8c44633345b2746a4741e07bf827"}, + {file = "cuda_bindings-13.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:8205eee6b8b458a2110c0384923ace206855d0f1b436fc1b145fcbaa1653b501"}, ] [package.dependencies] @@ -785,20 +785,20 @@ files = [ [[package]] name = "cuda-python" -version = "13.1.0" +version = "13.1.1" description = "CUDA Python: Performance meets Productivity" optional = false python-versions = ">=3.10" files = [ - {file = "cuda_python-13.1.0-py3-none-any.whl", hash = "sha256:19ce93ab3c8b2116ebe23c87fe023d82df0766af4f956582a42d3482a2787e33"}, + {file = "cuda_python-13.1.1-py3-none-any.whl", hash = "sha256:944cc4fe6482673d28dd545797a28840945a1668739328fa2ad1e9be4f7050d9"}, ] [package.dependencies] -cuda-bindings = ">=13.1.0,<13.2.0" +cuda-bindings = ">=13.1.1,<13.2.0" cuda-pathfinder = ">=1.1,<2.0" [package.extras] -all = ["cuda-bindings[all] (>=13.1.0,<13.2.0)"] +all = ["cuda-bindings[all] (>=13.1.1,<13.2.0)"] [[package]] name = "cuda-toolkit" From 9d3c675a0bb590c295bb1b69318c38bbeca1b3a4 Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Wed, 10 Dec 2025 11:10:55 +0800 Subject: [PATCH 049/172] [None][chore] Support larger topK for NVLinkOneSided AlltoAll. (#9816) Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- .../moeAlltoAllKernels.cu | 118 ++++++++++++++++++ .../communicationKernels/moeAlltoAllKernels.h | 7 +- 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index a92558da47..62c25ce3ca 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -45,6 +45,18 @@ namespace tensorrt_llm::kernels::moe_comm #define SWITCH_TOP_K(top_k, TOP_K, ...) 
\ switch (top_k) \ { \ + case 16: \ + { \ + constexpr int TOP_K = 16; \ + __VA_ARGS__; \ + break; \ + } \ + case 10: \ + { \ + constexpr int TOP_K = 10; \ + __VA_ARGS__; \ + break; \ + } \ case 8: \ { \ constexpr int TOP_K = 8; \ @@ -611,6 +623,90 @@ __device__ void vectorized_combine_impl( // Load directly into the per-k accumulator; reduce across k below acc[k].load(recv_buffer + base_token + offset); } + if constexpr (TOP_K == 16) + { + T* a0 = reinterpret_cast(&acc[0]); + T* a1 = reinterpret_cast(&acc[1]); + T* a2 = reinterpret_cast(&acc[2]); + T* a3 = reinterpret_cast(&acc[3]); + T* a4 = reinterpret_cast(&acc[4]); + T* a5 = reinterpret_cast(&acc[5]); + T* a6 = reinterpret_cast(&acc[6]); + T* a7 = reinterpret_cast(&acc[7]); + T* a8 = reinterpret_cast(&acc[8]); + T* a9 = reinterpret_cast(&acc[9]); + T* a10 = reinterpret_cast(&acc[10]); + T* a11 = reinterpret_cast(&acc[11]); + T* a12 = reinterpret_cast(&acc[12]); + T* a13 = reinterpret_cast(&acc[13]); + T* a14 = reinterpret_cast(&acc[14]); + T* a15 = reinterpret_cast(&acc[15]); +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a1[j]; + a2[j] += a3[j]; + a4[j] += a5[j]; + a6[j] += a7[j]; + a8[j] += a9[j]; + a10[j] += a11[j]; + a12[j] += a13[j]; + a14[j] += a15[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a2[j]; + a4[j] += a6[j]; + a8[j] += a10[j]; + a12[j] += a14[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a4[j]; + a8[j] += a12[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a8[j]; + } + } + else if constexpr (TOP_K == 10) + { + T* a0 = reinterpret_cast(&acc[0]); + T* a1 = reinterpret_cast(&acc[1]); + T* a2 = reinterpret_cast(&acc[2]); + T* a3 = reinterpret_cast(&acc[3]); + T* a4 = reinterpret_cast(&acc[4]); + T* a5 = reinterpret_cast(&acc[5]); + T* a6 = reinterpret_cast(&acc[6]); + T* a7 = reinterpret_cast(&acc[7]); + T* a8 = reinterpret_cast(&acc[8]); + T* a9 = 
reinterpret_cast(&acc[9]); +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a1[j]; + a2[j] += a3[j]; + a4[j] += a5[j]; + a6[j] += a7[j]; + a8[j] += a9[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a2[j]; + a4[j] += a6[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a4[j]; + a0[j] += a8[j]; + } + } // Reduce acc[TOP_K] into acc[0] if constexpr (TOP_K == 8) @@ -643,6 +739,28 @@ __device__ void vectorized_combine_impl( a0[j] += a4[j]; } } + else if constexpr (TOP_K == 6) + { + T* a0 = reinterpret_cast(&acc[0]); + T* a1 = reinterpret_cast(&acc[1]); + T* a2 = reinterpret_cast(&acc[2]); + T* a3 = reinterpret_cast(&acc[3]); + T* a4 = reinterpret_cast(&acc[4]); + T* a5 = reinterpret_cast(&acc[5]); +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a1[j]; + a2[j] += a3[j]; + a4[j] += a5[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a2[j]; + a0[j] += a4[j]; + } + } else if constexpr (TOP_K == 4) { T* a0 = reinterpret_cast(&acc[0]); diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h index 7361f9a8d9..93e6508253 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h @@ -23,10 +23,9 @@ namespace tensorrt_llm::kernels::moe_comm { // Configuration constants -static constexpr int kMaxExperts = 256; // Maximum number of experts per rank -static constexpr int kMaxTopK = 8; // Maximum top-k experts per token -static constexpr int kMaxPayloads = 8; // Maximum number of different payload types -static constexpr int kMaxRanks = 64; // Maximum supported EP size +static constexpr int kMaxTopK = 16; // Maximum top-k experts per token +static constexpr int kMaxPayloads = 4; // Maximum number of different payload types +static constexpr int kMaxRanks = 
64; // Maximum supported EP size // Describes a single payload type to be communicated struct PayloadDescriptor From 2c46126a937d71562e3c356bc17b1b2ff5670ae5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 10 Dec 2025 11:54:51 +0800 Subject: [PATCH 050/172] [TRTLLM-9794][ci] move some deepseek test cases to gb200 (#9841) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../test_lists/test-db/l0_dgx_b200.yml | 15 --------------- .../test_lists/test-db/l0_gb200_multi_gpus.yml | 14 +++++++++----- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index f46e61aa7a..f0c2b3131c 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -21,21 +21,6 @@ l0_dgx_b200: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index 2d710b2888..ef59faeb57 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -23,12 +23,21 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] @@ -68,16 +77,11 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) From 979f37e4433aaf80255096909e5e79e7eb94c058 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Wed, 10 Dec 2025 12:09:53 +0800 Subject: [PATCH 051/172] [None][fix] Fix nvfp4 gemm allowed backends arg passing (#9837) Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/models/modeling_auto.py b/tensorrt_llm/_torch/models/modeling_auto.py index 5788a9b2a5..ff48edc5cb 100644 --- a/tensorrt_llm/_torch/models/modeling_auto.py +++ b/tensorrt_llm/_torch/models/modeling_auto.py @@ -43,7 +43,7 @@ class AutoModelForCausalLM(Generic[TModel, TConfig]): config._frozen = False config.skip_create_weights_in_init = True config._frozen = True - extra_attrs = {} + extra_attrs = config.extra_attrs with model_extra_attrs(extra_attrs): model = cls(config) model.extra_attrs = extra_attrs From 0e78a4b24432b12efa30f16557e9ac62154cb5db Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Wed, 10 Dec 2025 14:01:44 +0800 Subject: [PATCH 052/172] [https://nvbugs/5702791][fix] Unwaive fixed test (#9844) Signed-off-by: Wangshanshan 
<30051912+dominicshanshan@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 5b75702332..a7fdd1f449 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -384,7 +384,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mt accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5698897) unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421) -unittest/llmapi/test_llm_pytorch.py::test_embedding_bias_with_torch_sampler_strategies SKIP (https://nvbugs/5702791) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] SKIP (https://nvbugs/5702795) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795) From 49fe089470a6a81022b5aa2e49ebfa6ae2dd0c94 Mon Sep 17 00:00:00 2001 From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Date: Wed, 10 Dec 2025 16:18:11 +0800 Subject: [PATCH 053/172] [TRTLLM-9811][infra] Update urllib3 version >= 2.6.0 to fix high vulnerability issue (#9823) Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> --- constraints.txt | 3 +++ docker/Dockerfile.multi | 4 ++++ docker/common/install_base.sh | 2 +- 
jenkins/current_image_tags.properties | 8 ++++---- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/constraints.txt b/constraints.txt index d4b78a2567..9cea8d00a9 100644 --- a/constraints.txt +++ b/constraints.txt @@ -1,2 +1,5 @@ # These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image # is updated. +# WAR against https://github.com/advisories/GHSA-gm62-xv2j-4w53 +# WAR against https://github.com/advisories/GHSA-2xpw-w6gg-jr37 +urllib3>=2.6.0 diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 3d5aee7268..74e18b2cd2 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -71,6 +71,10 @@ RUN GITHUB_MIRROR=${GITHUB_MIRROR} \ rm install_pytorch.sh && \ rm install.sh +# Copy and install dependencies from constraints.txt +COPY constraints.txt /tmp/constraints.txt +RUN pip3 install --no-cache-dir -r /tmp/constraints.txt && rm /tmp/constraints.txt + # Install UCX, NIXL, etcd # TODO: Combine these into the main install.sh script RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && \ diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index b7c3f01d27..99ec57e2e1 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -119,7 +119,7 @@ install_python_rockylinux() { } install_pyp_rockylinux() { - bash -c "pip3 install 'urllib3<2.0' pytest" + bash -c "pip3 install pytest" } install_gcctoolset_rockylinux() { diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index a7ae94d2d3..ed5f0078bd 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512081220-9584 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512081220-9584 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512081220-9584 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512081220-9584 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512091705-9823 From 12693a526bd8b531f732739a7f419d121be82353 Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:11:32 +0800 Subject: [PATCH 054/172] [None][chore] Enable L0 multi-gpus testing for Qwen3-next (#9789) Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- jenkins/L0_MergeRequest.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index e3c80bf48c..234d94c690 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -718,6 +718,7 @@ 
def getMultiGpuFileChanged(pipeline, testFilter, globalVars) "tensorrt_llm/_torch/custom_ops/torch_custom_ops.py", "tensorrt_llm/_torch/custom_ops/userbuffers_custom_ops.py", "tensorrt_llm/_torch/models/modeling_llama.py", + "tensorrt_llm/_torch/models/modeling_qwen3_next.py", "tensorrt_llm/_torch/modules/fused_moe/", "tensorrt_llm/_torch/pyexecutor/_util.py", "tensorrt_llm/_torch/pyexecutor/model_engine.py", From e34302986d3935a2107b4ce698f163b33b03b39b Mon Sep 17 00:00:00 2001 From: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:47:03 +0800 Subject: [PATCH 055/172] [https://nvbugs/5727952][fix] PDL bugs with trtllm-gen fmha kernels (#9863) Signed-off-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> --- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 2 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- 
...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- 
...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- .../fmha/cubin/kernelMetaInfo.h | 3890 ++++++++--------- .../trtllmGenKernels/fmha/kernelParams.h | 3 + 1946 files changed, 5835 insertions(+), 5832 deletions(-) diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 05023fc740..164af32324 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:331aaf5e84f39f9ce4940fce18d646701f80caf6681d8ba1244934171baf9d03 -size 616196 +oid sha256:93da3db51dc2c74c43549079c18247be8c423165c425be80c9b4171624c8a1d7 +size 646191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c180284f18..42a02cf931 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e93cb23f1ee61233c61091dc880258c59fa006abb5950cc6c8e1a99da2537845 -size 551858 +oid sha256:d49a0d4280f06012123d03c1f19b2c21eb0884b4621530741c23255c5b1857b9 +size 590830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b7e30e8d60..3c15b9800b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:748f8edf49b35d4c0502d3a292f11a53673d224539f8a94e2f9724bf17b8502b -size 605146 +oid sha256:abf634eaf82d7c72b7202706c41bbbf1027c049bcd78e899207c77b344b5d47e +size 626655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0428110055..ca022b7624 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4ba7f26e6cb3e11b76321d6539bf2e3d194908058ae296a5b4c3ecb36fdfdf3 -size 540806 +oid sha256:79083440bb77f1423d1b3c33e6ff6e8434a9e95fb803f7a2dfaea1f972eb62c4 +size 574254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ce0b43ff0a..0003212fd1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:799f27d110ed5c5b76d30c39a945d1bf8a28078be2a9dbf35db18a27a8f608dd -size 466054 +oid sha256:56be9d2c764a380d45775cb93ebdc59cf6840267a695ad7910306c316d6e5a6d +size 490794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c918735f2e..74349949b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2e8a5d62a02d3ba248b1a19bfbf9d0cd11674705283022033f5243b98ee72cd -size 432382 +oid sha256:27fa306c15c04506f2eeb49456ad1920dc3a0ec06cf142f82dab0d3b6d8a5e51 +size 457936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 049e4c8f46..7a71bd7c80 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a21b45df44b576b61dff0cd0e89caeb324971601c3b458cfc72f32640a76eee4 -size 456106 +oid sha256:f05b47b2541185087b3d3502a98b6e1fcc50f10ea4f5fbcb173f0a7063392a7b +size 481636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a82eacf083..5acefe2718 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:281d59d446597519bb0fdd6fd5b46cc9a69fe5ede08fb6b46426cef8bf5b1327 -size 427984 +oid sha256:5b7ad324102fa900c47b2cb8feaca95cd332b84eea0e6ced413d7da86103ff8e +size 454304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 902283593b..b77af8541e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fa67e1282156d5bce4db0dcd6a488b0e68abd2281d6351e71ed2d9cc442105f -size 611458 +oid sha256:a140c2e2014fba58b2789fc18349a9d8ccd361448ff59179ae5839daa414e477 +size 638789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0cc98a593f..43642aa183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f322ae84000c0990a01b0eaab20450cbddf8240fed8d69c77396561687559ba -size 550350 +oid sha256:154807c5077953c2a5585410562d2a4f5ebf0b1b014f4a32ff70e1b65b193a1c +size 586906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 433e374a8e..549c4881a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8591279735daf4e616f620a0e4168b248d2f6505c1e69269055086f20826713c -size 435916 +oid sha256:983c3043a48bae92ac6e564c62041d37fbeb5995bfa5e1cf87030dcee8f63f17 +size 447238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 286d8bffef..bebeccda29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fedf3acc531073c068c75a502b0fcfe0ab72a4c8a74738c1359a40b306c7d71 -size 377152 +oid sha256:712e5ec1c49ccfc014430442ab3944544bd32724f3372956afef4eab55548bec +size 396366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 76f308000c..c5a5f3141b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:100eb03cb08b71dc45f37bab8d009daa24aedc37f33a6279e47568dc9f878195 -size 415686 +oid sha256:6badebbedf2079dc2f50340cb401545a1592de7093c79d3e08c3d27fa1191391 +size 429376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 14697fa1e5..db4881629a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65e4966f977dba8575f0cb5f023b77878cafcfb1294fa35a481759f7285b85de -size 356922 +oid sha256:0480d9b5c03425111629b47a6660b071e2b49b70417a56106d01d077a4aa7a4d +size 378506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 053f09e623..30cfb4a22e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d459c6fd38a1dbd1f0487979a9a200943b194b8d7be5d97b3946d7143f977950 -size 455792 +oid 
sha256:e7051e49c82e0d10102950a887d639c218b16e20d266cc3d7bb702154a299339 +size 474218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5d9194c1f6..b690bc6ee8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a2c8d5224615543b71f1d21b787b308a7628f83765ea6dca8785948c3051433 -size 421330 +oid sha256:79a038482959da988e9ccf405591b1e5bc2f02213f3bd3590f44f32ca1eb36ce +size 441360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3ac02ab271..e901d16033 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf234f00c09a023d6c41428a610989d2f94a87633a0c85adf5b3594dcb89354f -size 445056 +oid sha256:ce5c3c5fc0a318747990414fb172712b83c28d0ed501309a7cd2fbe786b67f9f +size 465060 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8bdaa59cef..8cc3153d4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dd66ceec03e32e2fbfacea619139f8937d95249b2520fec1e7bbf939208cbf0 -size 416934 +oid sha256:a0643f9373aa0a579efbd7b01b967e535534f6c123057e954a7eff2bef1d2933 +size 437728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c31381b5fd..102aa4b54b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d51517abcced73cf483d80428ba1950995691aff3b94706af5ebf37833fdb7d5 -size 600408 +oid sha256:40bde6ad280f45eb133eee398de57aa841d438ae72ada024a77374126d30faa7 +size 621917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1796a0b7a5..64e3aa0143 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e67116bdcf5ecb2fac46dc06056d7f48798652b7df16eb17d1f6dd620f87362e -size 539300 +oid sha256:72f04b54572591e5cf0539628ba89b2ae45fc43f38f764590b60dfed291a7415 +size 571910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 75ab9b1774..bb0d58ab92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be3283948cb5ed8da96c899dda4623d381380356d1986a00a9e2d1d04ab12756 -size 424864 +oid sha256:133585e8799982a224109cd76c24057f64c570c5815a163c75a391ac2b2ff6e6 +size 430662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 013e6763f6..41f6b44892 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8b8de657d2cc17016d83a123362d7dd837a143c1c03c76824e21741e71ad72c -size 366100 +oid sha256:10dd0e900165da3946a290fa7644b3965c679b85e4512c836b3cb6f88e53a53a +size 379790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b78311c1a5..1826f73e87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1b96a8fdd69872ae0702c8019edd47805246c5ec1267b4916fa8588f6a3643b -size 404636 +oid sha256:97bf0b35d7adcae83e2614b287eec3477407910591747fb9bf7a8f9d2e4100dd +size 412800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a9572405f9..dbe9a19ca2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73ac5b7c1c31890097cfcf8dcd623be91ac12189c18855bb0b90100162689ec1 -size 345872 +oid sha256:38c475483ad75da7821f3a907a4f2f0cf0bf57630f8908ce035daa75b92d79e9 +size 361140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 78fe682a68..a950f9702b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25eef31d38e7dd13fc4c8b5ddad1414c503cda6bbbb486f2849d900ef28fe4b8 -size 486610 +oid sha256:065d4ceb129aee11bced17a64c72d8bdbc49213705e70f6e2fbbf9fa038533cb +size 505826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7b7faa847e..2d5cfcbdd2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d30bcc35d5effbd6be667af3d19bf8b38d021a52a2f4d3e417b2e948e469ed56 -size 452148 +oid sha256:f8089c7c268bc3a172f4cb14a6ca3e670fa50ca3b3fcbd329ba86f9f9c9e175a +size 471388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index da58eab3d1..bc927e6ca9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fba723537418085d113c273b18866dbfb777133913e86d506bbfc2340bc76981 -size 475874 +oid sha256:8a1f079dd99ee5f17825c71799f6a7a0b36e89398e16a9b93ee23c9923ad05ca +size 495878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
c845bca0b2..8a43bde3d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17d90eec0a59bdc7e4d40d08f03c3c55247aa6349fc2c1fa27b563dac75e8e0a -size 446962 +oid sha256:ed86e9c733c048bd520786a10cf1620bc5c3071d4d84532ff2c904790cd60b02 +size 466968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 59cadae006..b8bd776ea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:caf3c99ba08d92571f2f5995a01b6c89a55b03f1ac6c6d1569286d1d6fae40a7 -size 655005 +oid sha256:96f9657094acc07b98d2ee41bf79e7e397a2ff1a66254a2d70040d1f12894a92 +size 701575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 34807221d3..beceafcfe0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8915c45e30aff50cc46d42ec34e3a0fa8a2401bf39a4f1a899d9c06a09829127 -size 592392 +oid sha256:51c90f7a06bde3f283e4fa78ede68f390a2d74a1b742827ed28bfa53ba0af38f +size 644193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4d4bdbffcf..dd6c7978ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ff83cd863b24e8ce157896497a2d001a3b30ca28037bd64f812748b2d55083a -size 459530 +oid sha256:8d5f6eca62c7644bcce981f107b09dd82e95d3e1f8b5a1dda38fd626db0eb411 +size 481878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4fccb3348e..a8d6936fa5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5bf9b569886ae544bf989033e8917cb833b316523f84c409ad27be513b9d118 -size 389692 +oid sha256:1aabf34bc8b83d18d0c34f14bbb905247e02e38bb5e423a1e040fb3458764e8b +size 400224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 54f951fe33..c1963d3644 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3fc291858b0c0af5b3a0694bfd3aafc0b537aaff3540a28aab7484d39dc72b5 -size 435356 +oid sha256:327df4bcaf3e7bff22742d558bfba0e0fecb7ab3da6c79a82a74665d357385ec +size 457704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 35f6573fe1..2312e17f5d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c66792149402a125e0720afd5367e371346f80a87740f1e1f9e280ba48935124 -size 368674 +oid sha256:57df5394b6e47321c21553a330b45b4003e127196103eecc23921ac165f83d13 +size 380784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a498eae88c..260fead36e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c503e3e00848c097001c73b80f98efdb7341970ccf79a258a15e95d93192a12 -size 475560 +oid sha256:60f3586c65a1c05a88e0729da3bb6e6cbc48f6bebd2566efefe82b4bbe1e2cc3 +size 489250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
855ad055ba..c34084fc46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c883a1e96d3601bdabba5a536e32a64492de256f9a78a3e5473b5a3973318e7d -size 441098 +oid sha256:0af68c71338072381aa01878db5a816156b518109a058331928d26d0e100548b +size 454812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c99d92f9be..d095f76ec7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a47c590c342a1dc967f6a8cc46fa88087f43db885d8cf0b5c3523a20792271f -size 464824 +oid sha256:95e0c23f48a26672e87787326ec4c21b2d88203a7ae621da3db7eb4c0e5c38ff +size 479302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3bc659992c..e8b01023bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dd8a6839f0c69a14b7c96bf65903fd19dabadd8e4ceb1b7a61bd79b326de697 -size 435912 +oid sha256:e9d4b8fc32aadbd4ac6a3e379fb72cd5b6be96977205b301c0dc6468a3fd5274 +size 450392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d21acc038b..e1df63ebd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b66b9f3c286a06aad56e755d623309acd2b2f041bf65744d2b1e0044c1610a53 -size 643955 +oid sha256:4593f7e2d25319baac5ee97eba18cdbe22c5a3dda77c6fafc0fbc4c16f163988 +size 682039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a4af6707cb..b9bc402575 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9756e332b460658e92f6fe698cc7e63c9a06ceaeef0e012d8a26161bcf4d2737 -size 581342 +oid sha256:1b812ec9d0a8259418346cab34054a359eef4bad1d97abc6ae2f34c854244f92 +size 626827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 506d38acc6..a33972ce44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5c5c4d3386e2b9963e815ee773f3f58677291252c2f0038b961d610d8644141 -size 446902 +oid sha256:8afe73f4880ce3db60376c3da8e23b7d1d7b9d43243b8f4dc7242fb6f8b0c56c +size 460566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 026901ed28..5fa2d29a3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f56fcd19a0645041db6c7ddf5d2b72274b3409bf2890e63594a6df18c951dcd -size 378640 +oid sha256:165f4d266232b757878fb4ddfa5e9ca8b6eaf859de430d9db34aa94bcc26753d +size 385226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 08c69a1fc4..ad8f72b00d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d9d472bfb717342744c8c2d20e233d8ca1e8bcd1dad273a5dfb67cc5fb5a6aa -size 422726 +oid sha256:be8d1f839c20ac0404f901064a74e1edb22a9a4133f8291799321849b19038f9 +size 436392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93f1fc3551..a660f53f43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b3f3b97bdbc378efed10abfca45057fc90ae7cb474f548b9a2d673d6c7d46f3 -size 357622 +oid sha256:70ef01714ae6a43a4ec4ea46810196cdf3c8a3fa0de254a21f86709761da3c74 +size 365788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ebf10fb7bd..17f86216f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9db70bd58f0ececebe99af2a3e5b0fd2210a61dc0e132014b29b4b82226a140 -size 643601 +oid sha256:c5332e744cc896ed0806d76a401ea1d6375d04543cdda4da6cdc39d3bb3ca756 +size 687015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
fb1bf1f335..9a6d259381 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8fa00342e3e314c30d5ac034911a9ead9d2fed9944669d7e76948ff81e55136 -size 558122 +oid sha256:1bad078994fbc0138bd20118fe1a58762e6bda82d853a12d8d57cd7494884ff2 +size 596282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 19b4b4ebf9..0485667eea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b7926a57348901fbc4ae9a491e32e6c9618ab093413ce2de5130ceba8debc6d -size 622289 +oid sha256:a797733dbbe12b8ce9e88ea899218cccc835fa9e1aa7a59ced0cb2c24b4e180d +size 648437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cf3b47056c..b1a3585e09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e000110811695118a329708768b489af3d7d0fae31eaf177484b19a83fc8a0e -size 536810 +oid sha256:2954719c224c9f41c4d49846ec419ef65914b32f33d3549487d6c7cbfa9b218e +size 560762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f86c608a7e..35b498ddfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:468afb05624f351bc10082cbebf4bf5062e3677cea4b33261fd2c676da4e5403 -size 577350 +oid sha256:d6a1ec3025b9d8e4383aa0a5d6ab4aabb7b4722960a18a3b4570998a9d506236 +size 624981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 284940e907..6f04dabcb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e12fce20eb2f118eec01b99a83b7df1f7f64084ea4597558c00e37dc7eef5338 -size 548414 +oid sha256:6984c53944d4a207d95acc63936d535443561ae6b957be87607410896cc6ca49 +size 597648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c9b75a0382..77867b73e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c77ef0846f3259e498c84eab2ceee9d9046090ea2fde08eaaa84098f9ed6bf0 -size 564246 +oid sha256:a37fbe08db8771b4c234c2ecd1a74655fe71a821c9438eb648afbc25b68e0ca7 +size 614244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6d7d9319fd..2ade4aaffe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:232488b1d9400a6b762f1bd0e2410e4b5af32a9a246d7cd25549bb1277d81e89 -size 540070 +oid sha256:3243364468a0d44768d2144f2cb11e50ff408cb8b1a86e4c5a6150841c8042df +size 591648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8d065652c7..33b31d994c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d15bfb99f4c5c25802b3fd770d192ac901c92488bc720114696e67cd030ab17f -size 644685 +oid sha256:9a9a4821030a08be606feb8b5c581561b7287e4aad21daf5aae286596e24d8b7 +size 691255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3d0858afb9..85532bdd6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:be09aac430718bb6bc7b2f781940b2eaf3ffcf82e38f74069edb9219ad481570 -size 591396 +oid sha256:e1d02f535560d7918d1da51a43a17a98713c9e9712c1af075606684efcebab62 +size 637499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7374050586..14186a7759 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d798a006359f1f26c859f4f0ec21610ed920a9219ee90e46f4d970c8ded378f -size 540896 +oid sha256:ad2341ceaccad4f99a9fa77077c73ea12d51d782ae87f2c7657b297ce3a7e0ae +size 572742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 72532d793b..d5923b0a6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faf27c3b14559d4faab6bd9966398575418d188528472671484f80757d02b4c6 -size 483712 +oid sha256:59b0d398d37e92de77f29095a0a32658ba48387a87d791313c1470a25c125eb5 +size 523450 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c079764795..fa0df02455 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d3c65e042961993f48528bf6889c14930edc96711d3a8c10e7a141e59fdad21 -size 519090 +oid sha256:338c0f4bbb814dac0a15e39cbbff9039bdf869a231212d57110cc7c0eb051609 +size 552512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8427338b61..3ae66ad282 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f773f14e5eb64282f0cb20def7afca642c44b8f3d88d7de64a81ce97ebbc730e -size 462694 +oid sha256:102eb7cc918b90a7830e2c0e6931312d63618a5f4c60342a30e9dcc81a4f7fdb +size 503220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 745f21d792..7bb734daa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6eb83939ffe064c5754f001c39a7438ef3fc26b0361e89d0dd67e9f99e8b942 -size 556826 +oid sha256:cc11fc72935a9f154c0fde530c448d30c3888ce5c490073fb951fdd9bdef7535 +size 587092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c1e366c3f7..d7b293c2cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:258447d9fea24b06db34ccda1786ea522e7f6de28929ff6a3e1d528fc7ce23cf -size 527890 +oid sha256:f6942c1f3f93e68c4e7cdcdecf6c5d8dae8bba5f2349ae106fa11fc9c82df6df +size 559760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f4ed9ad1c..4ef89fa14f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7f5507c10fb937110d5b53136a2da733ac3dae4453d206d33c3b3ba4c53a2f5 -size 542934 +oid sha256:12c9e9c5de52e5957734c58fea720e44a8a8b6727342efcd7bffdf84080f392e +size 575568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3ac1d9d91e..0f8ca5212d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d1ebccdf1fbb88bef38b3ec3ae7011d5720460dc26f9579078160b0d50b32f6 -size 518758 +oid sha256:30f31afd6adbdaf917014bbfd67d3f2f0b2c78612b43ec687aa15152034ab2b8 +size 552970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 14fd889e92..b609b34d24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f8b36db9bee30c84d485928c7868224ea6b3cfbde22c41df16e21c033090b0 -size 623373 +oid sha256:2dbd0a4fdda0f1b27752494513515ff6216cde3e6dfe1564848a4ff38f4464d3 +size 649519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2a01565457..4e0d52623d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee1938fbd4ab4bffb0481fb2ca6c275618fcca3bcc111223eaa1ee08bd80b889 -size 570084 +oid sha256:5a4479ee73e40e0fc7fdc2d9a99caaa5277e4bbc5801d73ee130c6e9d10fa4df +size 602768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1caa733da3..a40a0c1cc5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7fd2571726fd4cb244aecc37947527269dea08c117b76fa557f84f8e98e31e6 -size 519584 +oid sha256:8467693421a3b6e1f35c49968dc15bb4e73b46646a9af2a00bf5e2a4027c44c3 +size 536432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 897992d079..6bfa451bf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad29a024c26c97cfca9c5fd36711d75a1b2e2ee003b6d6346f6b24011b94d026 -size 462400 +oid sha256:11e8aee556153ad65fad19236f50cb87c1b0c9c518a56612b4a7579bc177bd74 +size 486350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bf6869648a..e577a95c2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be323d5d9b2ccd7f4dbb76ee5a382e139964300bbdd43303807bfc823791e631 -size 497778 +oid sha256:d7d5362499c29096ee4e0c7d1fbdfa2c1f836ebdbfeb8092a9ef18096a054035 +size 516204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e5d8b8e49..f8fd5393f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cea6e8f1bec6a98de7ca2222e31812508ec738efdbc393996e78f4e9f60c57b -size 441382 +oid sha256:ed9a9f8b7761028007007937e2775e299670804075a4cb8cc795dcf5f49f3ed8 +size 466122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0b73adfa2e..186cd21a6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b68914d54cd117324f3fbeb96e72aaddab0c493905a6723c1a09bf59a0e4a17 -size 597906 +oid sha256:a2cc2a41306c8a19875e2951b0689ed3f92a5021b0df2e844f4461a11ff084f7 +size 640803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e888180eb..f01ffe7a66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80c1477577eae3cbd18de1b93d924ff2389788b7a8a38b61479951939c21c52d -size 568180 +oid sha256:8009feb523958aca6a46e40d7d3e2ba7b6a4b773941ec4c1c8b17583948f33bf +size 611890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dabfa886e1..0ee13ce194 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef887adc13a228174a68d2b216ed5af26d2a0f02a815b040d22f885a750ef9cb -size 584802 +oid sha256:4e6b358b90095f24f7679b8c4a0ed3a197c532b36999c4a27a9e490b6d665775 +size 629277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf9c83cff4..4f80a72540 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:079be02277fe6e79eed76ae5de3db2b824633e01ef4c92000cc2ce2b3c2ecb10 -size 559048 +oid sha256:d42910094ae7245c01f2a1735bdfa394a63e0ac57915871e27b43345cf460ace +size 605100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index daa3c29f2b..62c96bec0c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e5eafb627cc20d2478f1ecfbf9281bed9557296198150c46c636138819df402 -size 680831 +oid sha256:f33752bd17738a5a5651368efab153d542301fb864e87dfbbf3ba7df0ef89b0c +size 749009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7da50d2bbb..576b63cab6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71581792b78acf65fbac95c4fbfa91dcb56c3722245fcac43723d81eb6f05f40 -size 596142 +oid sha256:9de61de96614d58c1d8c3a9c72ab3e355fc59caefe1474dc2bd60c82f340c163 +size 633315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5232eadb74..5258c2587c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcf550b19440f807eaedfe3d2f5612843c15aa53efeb698e02c079e956685035 -size 564512 +oid sha256:495bd3ebcdf6f6f2270ab5c10825517b065303dde45553f10b65ab333a740126 +size 606594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5317b26148..e25386cfbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daf06a7d096b36abf96c3709d7e27c5dbcaef0fdef55bc2c687bda58037f2360 -size 491516 +oid sha256:8396e31640be20e49854d7e061dd9aa451516796c4d1b8b4236186a26f258240 +size 523360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c1f215fc2d..dcdb58a2c1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8892c0099e53f8788c95e413d00fc0151fdd6399e5b3422ad798983b6831271 -size 538758 +oid sha256:04ad51d8836ce6cdf64bf267731bc07b340a205f3bcf6ccb7842085042151396 +size 581628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eedf7c300a..6bd0838136 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79d6aa742973be1f54491256d3b1d328c4700e958e61bc48ec3f9c86beb99503 -size 469708 +oid sha256:6fbdb691bb14f6b7bd7bc0188171cb14155f0be066cda3687ec7e0947d633157 +size 503920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d664083b25..3e6c056a7f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:024d03446d8b060ec5cb5743fe25b9564488a06d9a169fc50d3a540ef3a68c45 -size 577384 +oid sha256:e4dcbb1fd7db30c312e7ff37754e87b7f022b52973359092dd6fafe05767b0b5 +size 602124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fba4a9160a..22c6f0e114 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e0aa7f3c05928c187d44ae19258969c6305b3d9727aecee50df43fda9ba64c2 -size 546868 +oid sha256:b89e910fce8afe4a1650259e5628c2c79535fdfadde7063a3c4cc36ba98e6dae +size 573212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
fb980b7744..7e695e0394 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec7cf2baf07aba0d19c90cb99205d59b25ad9eccbdde83f4fcd912c992c8ddd3 -size 563490 +oid sha256:b416a3295f7fa46b5ef9cf3b2fddbfdb924e191eb6758ecd722b35805d59d2fa +size 590598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b4667d36a..d407ebe47e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4a92a9f1363fcf04df4663cd5e1da2882531d3a7d604fde2ef49fb4c7ae64f9 -size 538526 +oid sha256:61c360e55f8a5dd89601bec6a1047b3c1d7269cd4980d34790277d5291479897 +size 566424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
acbfc2dbc8..4f940146a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2abebdb601aec5de59597c60734337231893988e5321c8bbbda1fd98c16e7c4c -size 659519 +oid sha256:267e6057b6bd4fe3a94adc24dbe0495388976874a6397c7438365b9482973219 +size 707273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 313d8e78bc..a529a0bcdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26ae99eb517b8e032f75f35bce06348c528f75797e6b7a72e3089239e307ac48 -size 574830 +oid sha256:56b034f9974955e092c407ed23bbf427593b43d2c5a4de352d1b5fe2d1c13b01 +size 597004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 714916c218..09828e0fd3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7890a9b7b9f9be2ec568fd9ab329716b883a4980a52ac4d8c7f0cf0c7c701351 -size 541622 +oid sha256:2cbd4199b9a5929907cc48e68ac53cd0c16cd3e30d4a2d8bce8b4bb7dd722755 +size 566338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 83c6068423..cd998ce097 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96b61bac1e9df321b4589dcbaa91dbc9452d1edc37eb0191f051092d12580e71 -size 470204 +oid sha256:80b25b6631be05309453f6fea1c94f03afa22ac794966406958d5ef0c54d5947 +size 487840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1d791f576d..88c2340b53 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21847047d1bb26303f3ff65bdcefc0bc293022aa7f33f0ef225669e382969ee2 -size 515868 +oid sha256:d13c8a5ff0a64d243d633c04bd9feb6c43e8369fd0a748b949dbcf09e6acb798 +size 542162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 47909c4ed4..754738c0e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7951d661d2091fbf0edd90d4f69f6e9bb188f250763e0d101bd3bdeb7da3836c -size 448396 +oid sha256:ebf364a63f6b8b18bee09a555fb9a80e80768862c21ff11e540ec7649a15b592 +size 468400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index abb0dd5d02..cb973cc15b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f607c61775edfcf3f47b485f63257fbce88fe142f7ec181c6120cae1020ebcea -size 546412 +oid sha256:af793d48315c7f635641e4977269c2220c90210d3e4c06c85f170f9b3a263109 +size 581660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 50f6cd1fc0..fe63d06ba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4550c99b127d6e5cce91a0aada71698c1d53d22c9369ca869fe78ea8a9db5c06 -size 489004 +oid sha256:a5575d32f19de2bc79e0019c460dd9cfdacadab04f3f7d1423795e596e106cf4 +size 521096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b844befc0..2e5cfdc644 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:104f983b6c784ae4dc350002ba2041a5a611f35b894e9559908237e27bf5485d -size 535362 +oid sha256:81af6008028a6b1182d5e9919d2aa7bcfe1a70184e51487994f993c816d010ac +size 560818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 94b2efa9d6..5a6521a354 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec11f44c73f8a0c30e17c4f38e0ca83ca2caf35930a01916c0a37dd5f4bb2866 -size 477954 +oid sha256:6d9900baa4d38bb1a962febc14c5748ed693603e0b9e3d5353d07f8b248f0d97 +size 503730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 321f573a58..d511f3c5d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:5d2dd9c8691ac39d863ec6a03949701726e37a020d7d309953608e91b219c57c -size 440226 +oid sha256:12fe0a99bb49acc6effaf982ac94cca4ef8d63e96e6e8bfb0b9ffba31b24d2a2 +size 464942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 848134aa86..0add7d454b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18e44860c35a0e70ab5d251842be62db55f6c0ae1a0d1d56a4d1242a3e589ffe -size 422118 +oid sha256:545a970d35d1a1b72fe6110e82e072ba1ebe1286b9dfcce53665e8a054171dfc +size 449252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1f23599fd0..b78c6b9ef2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:8b92cf1407bf7d4a412a33261e752095afcd73db7ebec181db8eb870742470cc -size 434990 +oid sha256:b1278ea9908cce2b4095e8f2312aa97d2b94eaec477457203013e75855a77b3f +size 459730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 56ba3cb57e..594638daf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11627a586e9192c7b98d6cb5d7e5d0e0523466e93727cdc44b98323f0919d100 -size 416932 +oid sha256:f291d926e676e4c8dbfe11171ba85931cf743634693d77393fab74c2cf5b266a +size 444040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 632ef8d2a9..96f7c0b105 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48d2d8411beef49b0505b1ad3d9abf8cc20735d581d9c77255df1d61646d5da4 -size 542464 +oid sha256:8188f3cbd205d1122c1d2e1cffef8d6b35c599c39e47a47e7642f9288ed31098 +size 574234 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f36746d3f0..95432f958b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccb3c33ffa5228d40a13c5e7dff78af575091d80797cad71ee3e8a2e44e5ca8c -size 488510 +oid sha256:58e60b3efd13ad6926596b6a474d6a4b3d379668b191a972e3252599d90ea328 +size 517222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c18312e668..1871e2dd81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa9d59793942531b1625395db781c0f4acba9cf3126a6cc51009c94e667249e0 -size 424270 +oid sha256:547233ce34185393dd425eb4aa164b6712966b891afc8e16a22d40fc47cdae1f +size 434804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b943099b73..656283ab04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07cb79b0f19b0daeead6b66ff4a7d7a9a0afe3c8792f592d20ff7ecb782a289d -size 367086 +oid sha256:40d0a8a97b1443bcf6a358cbc25d44b03ed9e8921cf6b082d39be366ebcb14ad +size 387090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fa94728610..0b01b6c825 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3b20d866a20e7d520b0bb2f94e64558bab22fb20e624d0a6a130e933ec53d29 -size 403844 +oid sha256:c5a094a9d6486742775134bf334f2f70537afcd2bdb0ef78a58876536230faa7 +size 416746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c5190b994e..37f1683140 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58773456ab84b701a85f0d6b8cf124399caa13e8d8999eb0ea33cb5c1a862165 -size 347448 +oid sha256:1b1ec662cae5795eb7c37bb8ac9c33c08babf6988794017d34dba4fa8816e5b3 +size 369822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c85dc5aa13..bb4d1e2d38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd834a9233128b8aa2748ddcf5aa1b5d40d5c0dfdb8d36c7189b5a9ddb4f6a74 -size 429964 +oid sha256:9a75cbce99c166aa84409a834e319ba1697bbceedf8291188edd085f3fcce8cc +size 448366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e8156ce60d..e5207fe35b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d83f7c1d260828c770337d2d82a5737cb036e5245b35de3bd72775b51a8e646e -size 411068 +oid sha256:dfbb8f14340e5f86493bb6846c68f8a8065efd368cd819307ec7640bd16976c6 +size 432676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0a98b22ac1..66f783f48b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd5c82c6ace95f20e192889242ee5fd97e5110c30403cbf9eb565a3fdfcfa9e1 -size 423940 +oid sha256:34bd2498c215f9da55e02dcf22eb12ccc6e0617f68a8bad279936743b661a80e +size 443154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e0a947a7ff..54eeb7a733 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15f1f68f21b5c4864304e85785d297519278d025de9ab4d16537ea674f82788e -size 405882 +oid sha256:d0d3474acc6c1d17294bd5ece4a054e13646fe0507b98a1fb2d3f780fa4abb88 +size 427464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7ca6dbdff2..dacef2b2ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1128a62a4d31b6d238c4fa736da158556e57ca7a58435207279beebe6cb292a1 -size 530624 +oid sha256:383094d00fa1821275c1fea4eb7ab3bab6a6c5a1e710c98c333009a61d7de391 +size 556868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 30585b4a91..d091dc4098 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09142340b328fcb68317406a0e9481c3190b64ac4892de355c91643d898fddd6 -size 477458 +oid sha256:05b308084b5a9bfdea183366288e294757374f319390ea7edd20648b0a35ac5c +size 502224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3887038196..a5b14cd45a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6bdd27c0d5e288916d0cb0ae9c0d0499221cb7656c361279cf4b012e535e5cd -size 413220 +oid sha256:5edabd6b2fa6c65fda61dd2b5e3c50070fe81837f0338d886a371d1cadd737e0 +size 419016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d6fc07a2d..ae061735a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6464a76a1341af0760acd2cb7cb9cb8f292d16c608ce983e0c12bded5f3e74d4 -size 356034 +oid sha256:9ab08b11094ff142cd6d857fb808bd1c58c59b4777b94a2fe8767b76b91ad006 +size 369724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 99f0f274db..ba9caf8a29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9b7f34e656d005533a860401ea93b5b6a7187243ae307843e212b5f38bb5698 -size 392004 +oid sha256:3b0b998d67069ee8880ed108c11e3e813ec510a3e837c644e495cd22bd975bfc +size 400170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e8ca300c33..fcb4be1a1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e21c44f85c8d8bf0f528dfc735151ab632337ac21766ab7296ebfebc43de68c -size 336398 +oid sha256:524cf0df7e9234d7d26d1a125d0b2c4c3bd6bbd33eb0f23bb43d41190c6bafe0 
+size 352456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d5ef74de6d..685f536911 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d7ae099f2a7cf348874977fa3c19bc7b8c60c5ca632846d581ed68c3f019d07 -size 460782 +oid sha256:9af89f03ab38960e203628e8bdba567ff950e09d2e0f078524684175435d2f12 +size 479184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6277545765..3f3c13ac61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:887e0cf2e483e6f877377095038434bbea75a30a125fbbbbb6bae880abad7813 -size 441096 +oid 
sha256:43365d8a2e13005ca234071f77c66f1627e48d430cea54d1d48fdd00bdb16880 +size 461126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 68e3248ebf..8879c0b115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:051899c7a14da745a2b534e484f5d95529a4609bbb24ee988c09f4b9c85a2b75 -size 454758 +oid sha256:d3cd74319646a5964772dcb4e7fae25e5ae1275b227cb2e4269ae76cde5b5720 +size 473184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d662b28797..bccf2a4126 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5e1d3c386e5ffb1657e9e383022d7085ac256e3dd1ac462b577e26b8da09f38 -size 
435910 +oid sha256:b3427c255a7976d421abb8a4763cd6560705a19783917ac403630b45044a2648 +size 456704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0953e00918..06cb54fa6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3bb7ec44cf77174a61ecdf88b4217677e839f0c7639dbfb4ddac65d701414c1 -size 586010 +oid sha256:c7d35aa53e2232dd9032832f64da8f6cfdd9cbf1161d269d441218179b6f6879 +size 635467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1e7570d5f3..31bf23aee8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9ae0c184a2f6aadc212bed9c55de5e08aaa4ef6fe7266506aac33026d1bbf69 -size 528084 +oid sha256:172b23bc5772ecfbab29e52452ce1889bb0d51e5505df991e3e73dd2844e4660 +size 573322 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d1d4f805aa..ac610ae55a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d48f8a923fb2792bd6a76f777a58d89e15d8f8cee3f2b8a76382ac0ce060de34 -size 447886 +oid sha256:d4b5ce991829df0b1307a7d1c0c9bbfc1f6d2af8240a2bbe7f06c1a0aab4620a +size 469444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f3d2ca1b7..f5b3ffec70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c344a297ed93cefef2b9c383eb09d79e26b70345b9784f07758ad322996b4c12 -size 378046 +oid sha256:c214e3e1fac44961dc0118aa1a4aaede7c70f7553e4f3561057043b550fe41f0 +size 387000 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index eef777c1f3..35d9889138 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2d02461d3685f23a33ba9e8f7ee0ea56e7a9522b43ca7dd3eab229f39531a99 -size 424302 +oid sha256:7fc0ce326ed7572fbae58ba4e578c5eb56e892a3f7a3dc8d337cd6066e9e9502 +size 445072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e10cb82814..439ba9b310 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0340900f165600e00a29b2a50cf0929369c5a730d122251816859590e9989b78 -size 356832 +oid sha256:3d4436521aaf2894cb0f421d91ed3f1dd82e18bd8fe6bfe7d43de69872e72ef3 +size 368154 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 57f517e6a7..896adec55d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19bacc7e90c7545799584fc518a5000358a777cdda48f2df2b3b2ad86e19fe1a -size 449732 +oid sha256:ada999a68a8cb5087440e36e4dd0372d8aeb9282630c20b5487d035ea1421e90 +size 462608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a411d65e9f..fd58dc714e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa3d22c3085847a1df031379177b9a456515c590f2854c95ba22c76aea549158 -size 430046 +oid 
sha256:46f890eff48e6a5fbd72de3e81c6f835656195e0572cb8e16e7f6c1f7eaff173 +size 444550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d630867b22..fb532c9754 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20602da1e43c2c1f44566c0de1854322fffb1fdc51c08223c3b5948d4062f94a -size 443706 +oid sha256:60da2c533012bbeb70f3c73457d5e3efcea603a4b735ee2da16bb44d9a1410e0 +size 456608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e7cb25537..78c796f0b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36bd8373c35e11bcf8596429257a3d32756d003620d5e381d0982d56a08ff449 -size 
425648 +oid sha256:1092e4545dd09b29a57d16d98ae4c2075c6d7810dbb9127ff7341abf5240449f +size 440128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d73b8d4c5b..ed40463430 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:460f55b9356a49ddef88cccba00405c65e7f9da766597551aecd281061d1487c -size 574958 +oid sha256:b72bfa8469af7276ee6611ac085b1ce7d94b31c0684f8e97aed8833803924828 +size 615412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4cb68a96bb..77f14a561a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b642a5560e0b5fb0e1f855e017a7732b45d3e5ec160a71c75f45426d9eaf51b8 -size 517034 +oid sha256:46d751177362348e4e86cfd4783ecb66dc1f7371e790a4654f2262b77b46e3c5 +size 556746 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ed3599bf4d..616b09493d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc67e2c11a2d3a5bec8c3a8b75f23222943215db1182b38b582b000fca4abcbe -size 435256 +oid sha256:dfcce29cce8009fa9b3d96126dcb9bb1c428042cff52854f433ef37d125f2152 +size 448132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1835b40bb9..e8213ca8ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7700dda59dc2d1229e0a465122cf49baaaa1dbea6c8fb4c6f52c6f959adad71e -size 366996 +oid sha256:2ef6e98e4a71a5435704d476810b1ddbbcae73c95a0f75d2dd351cb468ff3d5b +size 371214 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bb3e084f29..b83629c49a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f23b7ee254c170c8099bddb777fffc143a1b51d6402b085e3a9c3d0759dd02b7 -size 410884 +oid sha256:543a3dfce402841795fbe4394f1b774a62d5a23875664a0b21a51db04e6c231e +size 423760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea47c27c17..2b1d9b4834 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3d44a032f36ca85ff0c86daeb474e59c996da40e85bd9611f8ef606c8f1a8c7 -size 345780 +oid sha256:36c0d7b4903eeb739b54e5ba81a4fa0bcd076b9989a6652f158b6fe75bc231bc +size 352366 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 855cd6cc04..1f1f79dba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:804110850cdcec65467364ccc6eb4451676d49b4ef90edbf31c3b8d1965682bc -size 711367 +oid sha256:031daa2bd1515701fec5974a73626e32c2f5027a1ce6a451232e6cc0950a29ec +size 696665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 81382becf2..7b08b429b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55c1452ad7a0b7e52b4b1fab46362acda2541fb9ab7815bddc3cf023a4e016d8 -size 623397 +oid sha256:14df302b56d4cd425ae2036671e47f458b56838192a5bb1c492537754754c9d2 +size 611360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 96f424ca88..8c0aef6b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b6a44dc167aad9761dbaf8bd40c5d8ee559a49e5434dd1be4b118f149ef94cd -size 708947 +oid sha256:fd48b63096ea2c7bb2e735e7bda43690f44a827e9db75a7e9ee019cf3f7d942a +size 692865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 025b135d3a..84c4bab8ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4a5269de77c048079a5f1f0db202a0fa9563c632fc25fa54f6cd3e29dbf7171 -size 624283 +oid sha256:811ee09d7b631a0c23b7bc60af9ead0fdf725282bf1636c09a8a45deb2151e31 +size 610124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 
38feeaa200..fd9cf1cfe9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cee8e891d78a0af738a5cab2dcc2940000be44a65151f2a5b60b4cb55b014b5c -size 777605 +oid sha256:6a36ac1eca714524566a12c269c087d77d569d5fe5301c2037453121564015cb +size 763889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index fdaabb5f14..f53d413540 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31978f2381f7d64d899e02b189fe8038577bb42976223cfbe19d9956465962b2 -size 691805 +oid sha256:222aef7c69026bea35821c2c1dc29359faf0cd7be0a9eefca1f8a4e34eaa7e65 +size 679621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3495477d13..5c233bd708 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0d3a72762fd3d2475bdb000fabb11cf0183e7b5915e5209c4b77ad78715e7e2 -size 811515 +oid sha256:263da40cf8662e1c829562d9bc1f52820029dbc23ba95fd0c5bef54573a69b3a +size 805201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 63935c18a2..50bae647a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c19363eb32d2400db250d4dcadd1639632e8eabd8fed35bf816e6daaabf4cd46 -size 714815 +oid sha256:e6d2f405a52e558112ee087c5aad5d00b8057f533c25315ceac797d93880cee7 +size 713779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9502158bff..233ebe8c2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4e11f8083004eb096f3f4c41600c32c00a51b7c13e940ef46433e13e34b9f05 -size 797999 +oid sha256:60caf63aca97b81c56deccd583482353fa4aebff720cc0e97198acd87ca8c163 +size 783691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 83614a741b..c14ac2a269 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54938d4c1993c197ebe62d1d2001c95758cd3f23195d053e82c3812a603a219a -size 701297 +oid sha256:8e42ac68aac8368945c27963161743ca76d26c198991d887e1396b2b8f39ae62 +size 690887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 267ee23807..706d1ffc04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6afc1cda7605e0e29c90c74993cba32801b0de94353e4235af294c99ebf5b306 -size 806827 +oid sha256:2d3ba8359a2e6ca7a7b331fe5529f81bfa368be425c5d81ab21bacd0bf2bf603 +size 797405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 84a27ac27c..dcdaef7f20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b53cf6a19d213b69c2759386ba4c2ffe6f6874b7118090a7f277ed53584bd6a -size 715503 +oid sha256:275da2678929139b580dd638cf537034999d4c7eaf7190181b128eb4df34abb4 +size 712889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 47b725d1b6..135f895199 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2eba1a0192aaea67c44510b35e5ffe5730caa82a2f9b78ff156ac1b83c5f7ba -size 792619 +oid 
sha256:41ae2d10098b9846d7aff50140fa11fe4a87928554279c3ac143a568052be5da +size 777967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 56b458c673..1d306f5298 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8589f0ae1daba58eac6b6431e8d72424298d5622c3bed9430246efa64fa95a1b -size 701197 +oid sha256:e63febd9ef332f30fdcd3339e9629e41ac1eb9a572dcc6340a278cdcc5d9d082 +size 691083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8d030aaf0b..862f08678e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab47c4b41dcdb3f52ee2eab4bc68bf2828e18f1e5afe946aac96e8fca08087fe -size 881749 +oid sha256:0caf4347212780c0389e54e456721e307eb98cfdc47b8ea433e4d48b2acfd22d +size 875533 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 678cf4ca21..c8a884f591 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2efca32cedc22eea1bd558a85cf02921f5c1b03ddc87d7f64a7554aa542a98f6 -size 784999 +oid sha256:77121865c2a6c48ce32e07aecdc8b2cf2d36767527ff236bba4fedcf4a344bff +size 782483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5c9f8f70b2..6c83a9d3b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76e8db79ce9027897c95c6ab8d66e8efdab01d7b5a3acfddfad38653a89b5b1b -size 868281 +oid sha256:4a7e7466e31fb25516efe24aaeb8cbe8d8214cc6cb5faa342cf55692f9ceefac +size 854023 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7c6313d205..512c93324a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:593849bb4e1ad8ae8d6ff74f56e7e24a586c227b2d49b358c40452cd7141ed61 -size 770741 +oid sha256:210c18c8bd61934e64b0360dffd50be4836706b990298686e9173734e91027a0 +size 759543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c098f57b8b..761ffb918d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6af6026d1b6978809196f92d94e89443852e95560c9b9317935b53fcaaeda3ad -size 645705 +oid sha256:849770296445c7561b97a7b1c88fa1906548d016b71641e94f200906b2921407 +size 658433 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 79d9054742..819b8c7d3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97aaf3840d2a5e87140160465479d9791cb968633fb79031e0aaa0e53ed7b2d5 -size 547768 +oid sha256:cf870ff3c53b7f243d175378468ec713626ecf2b10b74da7a86c8a0e4cd0a731 +size 555810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 17b436cd31..7a1327a357 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:075282fc574b4b9fe9cda91dfd168f4ad966e4e69a957e1097fa406fd5c6e300 -size 643187 +oid sha256:2ee16264d45e8dbfe41b3ff219b4b9e86a50ca4a17f41d2910c87a6e02d4791b +size 655767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9470cd2541..3dfb7690e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7389159398ee1fb73662efd775c3784b99e4b2def7558a18e6b60fd5166a9a65 -size 561580 +oid sha256:d4f8f6806473a97134bf99a523e11244e8784f1ad6bee8bab33462575e40493f +size 574506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 912d850076..2329f33b23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b14b5733f0fa49c32a1d0e050c296a027cf61f7cfdcc32fe28fee5e19d607d22 -size 712879 +oid sha256:cd71af5563ab1fe0c2f7359c0323cec9c6e604ba57f6e3759671af008545b471 +size 724127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 13ffa91557..f37a3ec390 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe5ca895ddbe2658d739287bb747fba36154931af230da966ac55092c20dd48d -size 616028 +oid sha256:37d0ac343968ca5475e30f491c07dcf608f33f2dde8a18239dac1c7c3bc9e632 +size 621703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7dee5a6516..2af2fbbc2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48383c75a59ff494337eba35f85a0ee5499245b2ad809d61b3db2ee17b942002 -size 763859 +oid sha256:63606a3ed9b7b25350517eb02d684f0b37790fd55dba03b17fb05ff2207f43ef +size 801551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 396456bb2f..542e853f57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:779eed8da4ef730609b37b2c5f7ae7e80d036717d9f9ff7578764f1691810f03 -size 657143 +oid sha256:4159c09b75b723f4798905a0a7927b122d49bb544797df2a131651780981c6c8 +size 680331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2a58c21ea6..7bf51f6388 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6efeb679d7e776cb6509ed9f969c575a06757b8f590d901f0a0097d26d1b31f2 -size 736035 +oid sha256:21ddf03db65e23b7de2104b7049fc5cac970ad6fba44a53285489116faede2bb +size 748073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3c25bb0556..edf7720a32 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e516892fd8d94a877d7e522a1245e3d886c80a2663485bee58acf5231ccfbe0 -size 630109 +oid sha256:fbc0e982735c1b8c78d0e0d7c74b2ce74d86d5b79e967dcddd72f9f3bc2536dc +size 642787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 55b2e34b39..6156873d1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6454f7830809b5a90b95cb721e7380b6d34eadf362c1aecc86c94a27a3f28e2 -size 757691 +oid sha256:341de8004cf8a559929fdc1837a66b3fdf45eaa4ad63b2fc373b6ab4c3f4f976 +size 804311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 266d51e488..f159a4a388 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:854916a043f4dbd2c1943cddb7e337d2d7dc15f9a572958df1390695990c9eb1 -size 672237 +oid sha256:062ff62c80c956621cafe06523b778392efa33fbcf470438ee690f6b5bfcc5cc +size 709829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 67833f9e8a..c530d13cab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:144179855a572f6c80d98250f1fe57eaba6a77f337d1c09a80322e3616356459 -size 729917 +oid sha256:179bc4f2ce0dde59e7ce91cd9866dc83a8fca4570f1e383130e57e51b2c784e9 +size 749205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 25d6f86186..c3a8660cef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:60f5fae056d9bde564424995b73351471458086975d5d7ab1b81d17555717aec -size 644463 +oid sha256:a1adf1cc86bea530d7cf5620881c32ef7029cf657ad125b389383d3902911be9 +size 665429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 75b60f0e15..c1c0f726db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c58f0ad79c3554d598ffb065d93843fdded072589fc19cd2c49289f441a9b7bd -size 827039 +oid sha256:d5cc0867b614d5a4ad0f1c425b5700ec885fd14006efdbe85f319e88ce7f7246 +size 873659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7c24dda353..4768927159 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:10971246447c2424b91ea2e1c1cb52280773e178af1f6fdc8ca97069165b581e -size 724515 +oid sha256:9723755e14f30aa7e807cd059e5e989efc6af8118c97f1a4f1b2b0292b5aab79 +size 748295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 18f2194101..d43402b19c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:884048501e9322995404cf6f1527b4f6da4d72418678ba012a4c91ed3d9db068 -size 799263 +oid sha256:8eab5a89593e25739e3777f2dc95dabff7829da9ee62555bb03185c2664a8a58 +size 820427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index bb9e96c4db..e29ff32414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:369aed24cf15e036abec3b6aadc27de5e0dbc4a339ce7a1132e2fbaa04416063 -size 697531 +oid sha256:bc822b1911ac07b1cf4496647283472410e7bfa74e89196c643bf00bf7fe5f9b +size 710751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 036f2604c8..2b6fa6d4a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a682cc9596b42b629a004545afaa2edbc5e0ed1987c0ecea1bc1131ee93570ed -size 646837 +oid sha256:bb1a392dd02812af41e9e498b2179c51514d0fbe90cc8d584f1b3f24fefb4d47 +size 666571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 31c9b16361..a6bffb7239 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1917100a32cd74153d9cc857eef03af4317df62611dff17618064ef204a0e3e7 -size 563110 +oid sha256:d46224d108f723d8780b7003ec0447c20a59c5cea117947555c13522ee9efec1 +size 576232 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ef4a9a5f2a..59d61af494 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de56d155416ca0a8cc32853dbdf3e6bdc67478bebe3ddfbdc77498aab9ab15c3 -size 644467 +oid sha256:7df605326f91d9ee1788dab7da0d2d3b48634d1034387158e7a465e76f842e72 +size 661339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 49c6f121e4..bdb367c7e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63a3055203eee0d2630f256b6f978d78a567e091e5feb30a9c0acf35d514d006 -size 562812 +oid sha256:b7ce50c0c4f2edb3c0f063723124d6436c9dc51a2d04610b6279af86ddb0e238 +size 574356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f4686f01a0..5893cc9d08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afb8a310b54db9b803c7cd14277f37b0b7511e0179d7ed36f08f97f906c83ab0 -size 713863 +oid sha256:0a9f35980b8e8235602c7c228d875607e5df447a0fe67a4921aa7acd24a82e53 +size 732955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c7aca01615..9373765413 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c651df56d13922e07734298331442f2cd8d4fa23b371c0239389b287d08dd7d6 -size 632011 +oid sha256:f33f889626ba443993d71524af86ebb4754df46b7f59593f294e5b2fe0d1e2c4 +size 644099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 10cc8a22df..5ae0e43739 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2740d5517d60d7de1dfee9e35fd075fac962cb080d0fa06df205e929eca0eca5 -size 737465 +oid sha256:9ba21218786718760d2cb256e8fa102cb849e8b0082b5c3f73b537cd2128ab42 +size 758185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e1be9946f7..c2aca0b604 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3abe40c86b6af5f3484c9396a79e5dd5b002f9410fb84b340a30535996c6c922 -size 643081 +oid sha256:df41ac09a07267980e9b8b70e89991e8782843de9223856fdfaf1f8d03431406 +size 663357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2eed5cb71f..b69d549e56 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f4ff88fe6615d1f6f1c388e23364a45ea33b004c64b317a146d61043668b262 -size 730311 +oid sha256:2eaf47abd568dad8952c1bc8ea4790446eef76a09b8dee41e14139fa69934e79 +size 748465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b122367996..df5f7e5ee7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36f5a18dbe9dc32f28af7558805b0e1d3f989eed4c86e735758830b37d0ee75b -size 635929 +oid sha256:9b29f36529e24b2421b5e059728da7fe8a351a5e63907d23fe26fa815d784c13 +size 654527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 637c744ee7..03e21a686a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9218344b6dc1f1cd50917b63409317079fb7521f4a42583ea4ccb5d4cc74738 -size 726955 +oid sha256:6dee64b452924167c5618deebb0288cb0abc6b9ce9b1b3de306b5c3e8f0902ad +size 752163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3c6fc7809a..cb06970ba5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4ae46ed1731bd43d10ac90b90673d2bb099d60bc79fd42468d7968dfaaeb875 -size 642883 +oid sha256:05f6d16f6013c48ee2524dd58f04f8303db457d4f5f6ce7dda627d6c6898ddfc +size 661629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 783d06d995..f22cfca820 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:36748f31ebdf23ccbb5885f6a87d8b77a24a1f14cd24aa417315a04fd4d59093 -size 719899 +oid sha256:ceb43f8aba6d87ff849a5313fb819b56394c3cd1d1fcffc6d48542a0c263b96d +size 742347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index bb0e7ec925..9aaccd70bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c8510edde0cfada8d26666c1f0eff9767528106043d65d4164b71df81e60d3e -size 635779 +oid sha256:dd57cb8a2d1e767bdfbcf6fba9a481cd0d6a094faf1c0336b20d3cb9b2e9bb60 +size 652799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index adbe5e2b07..8a435c80e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b30f96eee241c4ae70e41c34a999a50e3667efb27d23816f2bbc31dba4edf3a -size 806465 +oid 
sha256:b2c25ed158b6e60369a4ab9eb1e042708a5c6c45e615d888268ced19ede544d9 +size 828813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1ce8cf1fbe..4257e7b33d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6f4d29abebffb8c8988173ceb174669cd875f09094bd3dbd702748007008e9d -size 713611 +oid sha256:cf9cf5db499df335b0923570d3852532262615d2125f24d8d7910a3d8dc44e71 +size 731371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9cb91d9767..4c11319dc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:139494e09ecbb1e09b53984d363c9a5eb3a785ca42b76e1cfc8f21336b6b37da -size 799311 +oid 
sha256:10a7dd5e5732b2822de4d6626f2108c8513291682593474c990ac7845c2aded7 +size 819093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a32698f1f5..04823ab417 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3f2a0ac5dfeccedef4d35b67cd6b4b0c81876fe324d213f8826d28c03a236dc -size 706457 +oid sha256:96df450c57c935cdd3b9b5407e2c48b328c387ef1ba7b66ee9f858ba381db5ab +size 721751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 152e2a100f..80d6cb6b9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a467d8291efa037a5ee3db3e70058174df4f941b20e8f35d0950f31a30e147e2 -size 721001 +oid sha256:fabe7ade0d6c6accdbffbdaf368f184606f6ecbfaba2fde3a6e690d8e67e4dc8 +size 731903 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d4e4139ca9..df5a017b77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d49b52cfc3b40bd6837d291514d7c574401b191efc06e23d0d89ca58fa3f4d0 -size 630811 +oid sha256:83a2c24f8cdc4157acc8674b9a917fa97e37937a1be3738a03103c48e9305780 +size 640185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 297df0549c..075cc7ac73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d55a8c572f892ef85cd611f015dd978d0e6ff5b1755477b6ccc5b71a9f767e8 -size 719519 +oid sha256:58acf192ec228da3a66c7195e8205b1078aabe4c5195c1a6a2a9698e0fa623fd +size 728103 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 41de2eea3b..8497c691df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d08d25174c0c75fe700eb7f2fb8eaa0c2306d4ce0a7fb4d834be5a5eb938572 -size 633029 +oid sha256:d08d1d5adea95a9a4b236c5a322ffd869e6bba6388560dd4a5503e3ce010460b +size 648669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c0a648493b..6da69f366c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2063cfb645381619514aa58aa5a98e10695afe78667f8cf04e56246b147cc046 -size 829487 +oid sha256:2ae997195a54cb2ae499050130273d896c7a06d204857bf1ec5304d5601bd870 +size 852477 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 35e911033b..43a9983718 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:179a0a53d23351ea72d93c84c32be30079b18b458af2c60d252bdf646a988b5d -size 734019 +oid sha256:18f412b8f66b6f1bbcacf15b47ad82d913388308538cca1eca089d29a2eb5fc4 +size 765593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1b678d52b7..ad834a76ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26f0c96f5f10828f583451acb13ee3d32dcd195275a1603572fe538c83f7106f -size 806449 +oid sha256:2005d9ae482d86f71a44b208063f277e15ed0bd2c854281a01900faec6efd749 +size 822581 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 105c722d7c..867dea4d49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f340ab15bbb03f8ea2b5ec02d65bf85df97b407840cbeeb7beabe2503f319226 -size 709599 +oid sha256:d6113f3b7e0e1397355a8ca9812909f0ea68b4f9d36fcc0956ecc35af062a6d1 +size 725583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fbba936900..7d6e578bc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ac1b023069d6f789da8697f05e1d286a142b337d59e9418e34c3de89c75c666 -size 827463 +oid sha256:d096cba7b9b0f10ce12b168cd4fe7b0d8c4988f784d2bdaa27bc687d6b31b3bc +size 847343 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 80c87eba41..b7e39c63c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b245ed24864772e6657c1eced824f20dd45fcc4774bd75f207efaa7baeac082c -size 752961 +oid sha256:c2fe81a8fcf7957bf7d4f89fcda40e436ba885225bfb5e19fa82be60fbc82aca +size 771807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 484fdcff56..6ac89fe47e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:703102b03f63def53d2957246d6bd55b65a715d5b0be349a26dc861542909f0f -size 806891 +oid sha256:f5b1f2fb539e0bf8a54041ac64e0a3a8e2e1023d58bee758810f3b19bc14655b +size 815179 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e36388fa8c..96969444a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59bfcce86ad4da128485a4a87aee0c820a6b0333188e0e6b8e9cdd5406034bb0 -size 721437 +oid sha256:e6fbb7833d3d53e1d590738f3c694622fc8e8ea9148beb56737de0c5a509200a +size 736779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ab3c3c4933..eaa235e070 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb6d6b093d7c99cfb2ec7e8f0dc9fd769202842818d147e8554c818d1586cb92 -size 721795 +oid sha256:99b5137f9c0da7cddffc021c23b4990aa89d76500e934e7672c87ea460a0c009 +size 732697 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 01c609be74..7c8b43906e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70b892aae43e722b43f8658f185ae28fe0b4de56721512c1cbd37d48c4a9dc73 -size 631605 +oid sha256:2ddcaca9d2ecae30d3fbe7caabe95002f9b8d3e31e006f14c40ee7f25b9627fe +size 640979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f4ec64b76c..8e716ddb4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d49fdb2ee49e3555fa56db74e5837ee22f3752f5301eb395fefa6df2f4138a24 -size 719523 +oid sha256:6d34111162dc390c60b1796f3c8364cd76cad8a5a3a653d73c87c15fe64c8fd7 +size 728897 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index c3c892d61d..33ffead0f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31ff4082e37e28321a4858789e601b997be42eca11f7db296642495d0f2ea5c3 -size 633823 +oid sha256:d67be78d25d25ccf4157d274a4248d0ffd82b6ffb3bcd913db0af6f44c5f07b2 +size 648673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 03adca552a..a29945c934 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a9fdf6380c357e04d0183da22d592cd0da9639a411c8939a710cbc71277d75e -size 655119 +oid sha256:36a1be13d930dc092462fb675fe75be97a59b0352beca12700adb78c3deebac6 +size 662125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e776f566b6..b8e06159c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:234a917faab18591aadee127e228400b1490afa7c0f9124f92d72b22e8660467 -size 571688 +oid sha256:a3a0637ca5fb5442df68f2ec93d846231e4e9ff831dad53a17e0f896d584e78a +size 577064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 878db8d5d0..b251d7b9f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5881e330211deb3ab1a336e73cee6c8ab811a8a7982fc3b06e3a00c117bc4013 -size 652255 +oid sha256:1bebb047315b40fd6ec4ae519bd72e8a1a45b3041c3939605fcfa04db0430306 +size 659113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 85376ffd4c..72e8033b65 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b465eb29282ac4cdb3d8aba74fd52fbea58b11f107beb18502eca9913e346aa -size 572030 +oid sha256:50d9afed9265a4cdf1f197554e8c4dc0c006f8d03efc1444e88eab99c17e74ed +size 576866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dbfc8d8dfe..2cf015033f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b2bf50aa52da95f679656b3534e779bd91f8a299d856a666b91c0f1a3b93465 -size 721357 +oid sha256:3c03589eea805c3ebe13eb46f20833e82a0aa31ba7985aeb73e597966e5023da +size 729349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 06b1649556..1e1303dc89 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b553d782dc40e13776e5d0a4f3f43d9416957d51f4b028ffec96e7a6ec7bb17 -size 638271 +oid sha256:f46a0089389db8ee83229008c57efc048b08443efdc4a7868460483763562798 +size 644487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6d86b1e25c..c51d812d41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:814f78bb4cdcf2dfe08c8de48333b7b8c1a5b34de770d8aab4858916d0330229 -size 742441 +oid sha256:e7cd230c02fe4d9e56790a9e5d3b5e5cccff60849c511bfcfc6fa26c33adda26 +size 750137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3d9a95370a..5064578e13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d81d017b5bc319c1e68d68d394affbaa3bb231909578f8ddd4ad9306b52471ee -size 651659 +oid sha256:8b585b2dac691b662a21bfe6edc6e9b42ecbc8f982d195a0cbd8e951f6df8919 +size 665621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b62d5d8531..4c5eb37244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:927b0c8718a33210b6650b207e0499b74c0fb7659131a045ee2dd5cdfd21cbe4 -size 735287 +oid sha256:fc6e9e7f35e72ad5693e411408203255520f662755a6508bebbcb79b16ed7bf0 +size 740419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2b16d72de0..0336fcc285 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0cd3279d7e978089e6d336d90dd2c37225546da47c108b510d1974e21ade4dd7 -size 644507 +oid sha256:ac103ed66ffc220e3a7a59190d1f35185bc10ae4c616adf5a94b814dcd0de1d9 +size 656001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7b9f168bfc..bd23aab51b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4958fcf6272de8419d500cebb478d7c2876dd08f8f10fbd985b3bfbc5b53f114 -size 737753 +oid sha256:4d8cf4ec5a82f63d1fdc0dc936ab4c86ed8193eaff40b8dc7baae1298ee11a42 +size 746139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0bb15ccc57..7197b0ca7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8137e6dbd65946b648b6f9d99276cccdadc58a3bf220f027b7faa8e8f7d91574 -size 652151 +oid sha256:5798b07f94f584266c2f9937b88be65c042e83a39b20fe665e9e062c559f8ea8 +size 664977 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9bef4dc6bc..30aae4de23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92ba77b51ded52d8d13bc387e85653ef547e940e36ab7da3e84f7012de351c30 -size 730697 +oid sha256:bb940c29daa1b8c76fde141dcb9be33d8c7beae9d0114e7341de54804aecd281 +size 736173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index b66fbe59ca..d6ab9941db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f5f5fcc0204ef8ab01e09dd164da38e7ca6eec2e5de5ce237af1ed5bf02af18 -size 644997 +oid sha256:21c6c6d6d7bd57c90056bbb79894a6f5aad66ba65bd26e6d380885e6dddb4f98 +size 655357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 70a6ec14d7..67275cbf11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b4bc7161f0153108c190f4fd856bda389a2bea2c3437c63cb6ec581c09e0520 -size 811885 +oid sha256:c63e9b2ac01beb7f03d514e0302927197c3e09b712c7473d5703222b67c12306 +size 836403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b0fc6a5c2c..a7786cba05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbaab2a72ad9c9ce87d9d16f9c3d6d0248841d4676875ab4bd640e079edfffe9 -size 721055 +oid sha256:9487da4e646d2e1748c452aca4572f3a40c84b4d17fc93d8126444837866360f +size 746363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e2385cca8c..e5880048f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:048fcfda830f06af11fe17a9a5dcc6bad60f5f2169cc4e20709f8fc088bb2ae6 -size 804731 +oid sha256:2f7f3cef0466602d31d7d08ac4b0f684862ab569b7caf66c0f11e16ea233d2f5 +size 826635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4a447e4f09..749798e07a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35c2d6a1e65e4c1c373d0c621d4a5244e2749f24930964adc8ad6e2efd242d8c -size 713901 +oid sha256:82fbc7c4734782914331421301b5658ca3d1f88f999d38f76963b9d34401fe99 +size 737531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e170dca971..291121f851 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c6956d1f5bb9c3f093de9973296b78d45101465e58e1df8ea3de189397208a9 -size 619945 +oid sha256:f4e2e68320c1bd21a59dfa3191af359f6e9e65f6503c8e5e26fe55c846cc6604 +size 630057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c18242e754..5597247c50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6361cf4462a0c2e43693da48b7875ae00849c05cf3178d1b2a4ec63caef30d79 -size 532616 +oid sha256:9549947d20900b9e08ce1c14af7206cccd9bd0e43599e45abf57e03b86b67a83 +size 542236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d070aec447..9f1b4c7491 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0b9a3deb3e4fefbf7ddf6cfd0b47cf76fcc9a9706c1c72bcaa8717297fc41f0 -size 619695 +oid sha256:cad79130b1f9ba3843c334dd849e64e38c8a14a20c586225e8235ad4e6e1b640 +size 631535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 31b6cb077f..cbfbf0459a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0880dacb28ea94b70e5a35c3e2397ca01e07a65610ace3e77e080267ec7bc74 -size 535770 +oid sha256:ef3c911c051ef8a6f3a5bf8b425a73e13115f474091619653bf466a147962154 +size 545440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c3e9c214db..e1d4690845 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e256aa049ae257fdeac14c8ecfaa40453559b91870ce624f3f67d809c90c67c -size 686429 +oid sha256:75e7aacd81ecb41983ad07834b4dcf7da78218d8ac7725600114182b034990f2 +size 701475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 285d77712e..712bf0caad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1d520ee991a1e533c8c9101ec60c7a63350e0730abed130d76746f3d42aff5b -size 598212 +oid sha256:9aa71185ecd8433e0a999c31e56cfa10a1c5ee5fd84305f519864847e7610499 +size 608868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2f8b6d575a..4d84181ba4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80d62b0b28afc0daac2e2b3912aada65ed45a3bcabfa984ef79d0b8684968666 -size 708203 +oid sha256:50de25c050d456c8c11b2a5bb1434b3f9384cdb4a114a52584c23470258c7637 +size 726605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6407b65ce9..2544ac1763 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49a2abdda8662956362db948903bf4b3fb033aef0b0c84f11e3aa05e3c1586c2 -size 621271 +oid sha256:9a1d636b90723275c7c6510456937258718f9b162ff3ba42d378261a31b52a5c +size 642237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d448501828..8d2a96f4c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:37327e358d6a8da2af6d9c9f650157e8094a3e784ca636be869a99cceab5b3f3 -size 694735 +oid sha256:fc3e66417f53f8fdd46806064636988e7db91497344f759fc9fdddc56446d349 +size 707119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 32b3eb166d..03f1930f7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3185ae47b6b764a1ac0385d810de7a4f71a0f1c2731f8fafc2f26a82bfa3bfe0 -size 607752 +oid sha256:c8ddbbaca73691eaf8993f05b8ee83ce1ce10c89ffe07fbdf6961c8a3ee092db +size 622899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 850e3d4f19..b012db0537 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dacc2206d01800511c214f52690a75d4b7548ae38beb9f0906ae1447f0a17b11 -size 712691 +oid sha256:7a0dd0973ea59ecfa68605510be643509b300922d2a948bff7065263df1d5d51 +size 735483 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7cc09d919d..51d01a5172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a316eb21a858feeb64b795f7df342d683b8895d220066f0403cfac3d5cb076b -size 624969 +oid sha256:5f4c0af046c78f9ea835839db10634480fe192531b8b6afdb695aba2054409c7 +size 646823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1a5c2ae058..ac66ed2c33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c97e168b7acac9ba7474217703c52089e8a3ab1bd4069e0c96b93812d079d94 -size 699223 +oid sha256:166b189d2d1615a4fbcec1fff4b9f4ec50a45e82da03b79e9bf694d383e8d726 +size 715207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8b7a36ca97..1a6f621916 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a1906bfa3da0d022ceaec4b81f31077d092d91f79f1a9b0dd1c455a1cba8930 -size 612288 +oid sha256:8aa706fc064394f6ecdd5d75ced6b0c7ec5301a1683f4a5c91d7cccb6dbffbe6 +size 627485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5befeae546..394a15947c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d38da4d6b33787f12ff9835b259c543dbf294b261f51ea03dee3c83b92e9dfa -size 774639 +oid sha256:a0e6ee16b6bad68bff9ff108895ecb370fdec39056296cd3c925b668b15ca578 +size 819187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9326ab6f72..62cdcfa76c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91b2c09f1be9464d5431cd77dfc01e4858b4ec00bb61e6062a52e109de53f823 -size 689629 +oid sha256:a3bc3bde0dace12101a08995e7d70c54c0eb421760143780709f5a431ec85813 +size 710349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7c32a51785..a5f70d1a8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff519f4ab168a1a27355ee805285b186b847e99f411d10939983a9fc397b8026 -size 761861 +oid sha256:11fa2f48dec1f0a0847056a6f7b5014ae2f18687f4965dcd216344a6b213626c +size 799749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9e266735e1..e884002892 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23e64d81981a0138139bc91ec3033213d14e65de9d766c915855a453ecb2777c -size 676161 +oid sha256:798616cada5e440334caadc171c4fd91d6e0b76b1b58f97b2e66f757c6057d0e +size 691799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6730876b8d..c54e7b6393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca8efd3ac65b04fb591491ff6f3fe03077c1caf04926f4fbe115d43154660f22 -size 616882 +oid sha256:454d924f0f227f1c1aad29cf5b0c27cb621945e690d7893475a3c3af3173261d +size 632079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 345529480b..7cdab204a6 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a54dd933925ae9c4a3d7c384343176ea79b0d40315a14f272a9c90656400231 -size 533798 +oid sha256:1708c5d1689a5038f7a561ec5b6d8d580124a7a624867342f8b0078cec37d999 +size 544602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 96eecf1147..51aba9e533 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96213342468a411fc31bba4f9d7ea2fc9c57d945327185989bd59bccd1d15305 -size 614068 +oid sha256:f6371c7e24164ca3d1e83c153410de9ee42b20e34336138902021e221645b2ea +size 628919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4892f4d029..68f96a618e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dddba5457a474607c1250ce61344495a2f1e9130212e59246fb8f90e9a421ce -size 533450 +oid sha256:893771180273b86234062eed98d3d5adadda621cbf667ad3db8f60a35d9f69ba +size 544600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 57032053e1..eaae77f54f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:857094917d13cc70ce651a9656a354e69152500818082247bbeb4b96a03dc361 -size 683121 +oid sha256:02585c06ba14779dc0ab05f8d45451a323104846153b25c9f2e65d6b4d1b5af0 +size 699253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 42d8fcab5e..f05cedd07c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2744c87c1abd0edc4d1c118861b0847592d453a96e08abbaf3cf00875707f50 -size 600380 +oid sha256:a59cc8ce8048c8cb18531f7decb0eb6027d559561869172279835366df013ac0 +size 612022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b3e3668c7..3d0b910c6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9b5652e4c68ad8ce6457601565519ee863c2b61eff8bc7bf6d25b58bc73dc86 -size 702923 +oid sha256:4f3972147a17fc32314b2602b7cd4e0c5df93a4a38702aed3c1b30894cc88981 +size 720733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f7500b2e2a..6ded1fd6d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a5b92320d9e0ea9a1241665497da6cf79fbef67aac8fe905fd7ac9a599b7d534 -size 614508 +oid sha256:c4f2fe9d8e4664cbc62f40f8a370aa10da28ccb22b99c0f20c25892d68ca2e7f +size 630445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c5647bfa22..cae42d87f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b66c4ffd1d60c40eab70b8d6c87915b3340fa249dce10566b10f9c472e53a040 -size 695769 +oid sha256:5dfc0ecdd1ad11773e75d71f629499078e7a32c7bccb6df984d27b7cd4930f64 +size 710965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 47ceb2a077..84f48b2f4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc4149e3ec600b460fded43738ab0f814e85673f780dfd290f11d308d9bb78d7 -size 607356 +oid sha256:58143bd44e827dee4ab076ac3a0acbb63cc31eb8867321a54aed6fcbc3abc042 +size 620825 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 96133f2528..6befd23e9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1d1d7375ee509792df7b70ac8a6ad631b3a5d0eb531b6beff2dcde57221cdbe -size 698431 +oid sha256:ed716a57cfd4ca1773223da9170be32888bd8b766a5bb23c45f508cd75351d31 +size 716685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f55ada794a..b2717666f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d031beaba6087fe86c393fbef99570b3b6852864778ecf8daa1e5eee6e54d11 -size 615050 +oid sha256:cab11dae807f7fc288c64ca2517e07e69d59e4dbbc27734b75f931a5c6b149ae +size 629259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index df32117047..d9d191dc60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7503f145bb38e3fb551531ae302f2223a42e14e63219ca6b08661f7388ce3cc -size 691377 +oid sha256:7450fd675feb4115bcedb83b5cd7f9fe00a17b61251674aac6a4d639787ab031 +size 706769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0786207ff0..bd2a7507ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9bb9fdd3b71a56219ecca65c9ada61ae6143c83c87cd10813bee892a7017cde -size 607896 +oid sha256:f713fe8a9a8056a341ff455ee6cabafecbe20e830065bc18fabb6006a43925f7 +size 619639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 43436a22e1..cea43111ad 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8f3441f6a95314354cea9e85035a156515716bc5e4528f90146b3ac9e1860ad -size 772465 +oid sha256:69fd24963c2666ddcc0dae11dc352241f74645126c3f6254c2c835b69932c996 +size 806161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 803f4cbe46..97a4e932a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a32d30609e1d43ec58a07dd41a8c6a1050b62833be809ba4fa277f06900b119 -size 681981 +oid sha256:1a03990a2631783181903d8c52029dc8469a1ca588b4cbd1248a920f2f9acd98 +size 714441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a687dc4f53..1ea182b107 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02afb01edae404a2a696fcea55e0319c37986809897b9837ffe941465b41a7c8 -size 765313 +oid sha256:b4374cd3d0a921860c58af1b46d69420ee47eb46ab1c5593494e71e94d9c4e0b +size 796441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3f0eb0b42a..ac0b0d399c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e0b0f2115262d25827a426d0d82d5c378a476373138b5a4ff0a0a11abbf8491 -size 674877 +oid sha256:647e14f344595f7bfe6b4a6741d6d3193bd254d68b5a531988493e037145f9f5 +size 704821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e4706c49e5..99783eafb6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5640248d6f9200e91911bd85eeb7eb94a99e32ce255b485f3b2cfd5b3ff2f3dc -size 654739 +oid sha256:7fcf01c3d741a2a1caa8ae1afbc198ae4a84a595f5f17ef7d99b358a61cc196a +size 676939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1c35cb28fa..23d15740f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1220ac3b6720c5e7a9428035969808e60ecc45c9e74155f352622c06148b98dd -size 566768 +oid sha256:5fd71197b2bc6876a6f7d59da320ac3eae711d0a9a49d5442e7f34c54957abcc +size 584824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b790b6a335..e66bbcb649 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:301a1077eb455dd3cd98ac595a3b74d518dffc2955b7eea64da8e82984f563bf -size 652565 +oid sha256:20480df206020429862bbc18e257f80c22f936426962b4beb1892eb9b1b15f04 +size 673089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 7838474b16..7449dcacfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6749fea20ae931daa1bf13c2356024abbb36a50a202beeff595e289d01c4efe8 -size 572884 +oid sha256:4b23635e92115a08903eb0d3a440eda3b47daf82a354bf2e26e72d220e5dbe9b +size 591876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 482a7abd47..310cf58eee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:5dd1f4f015083488e96cc5b487e21155240dc4ef68ef99def6c5e3c26494c295 -size 743639 +oid sha256:5ad7c3d888be7e0be27586965c4e50ef7a79c9303ba8017c1dfadbc6ffda12a7 +size 774029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3800db215d..7b9c60eb9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfa52a5c8f9995ea38f18a37d78d015ec0a5d6828ba2cd5afdb28eba42abe3c5 -size 651723 +oid sha256:11aa8528e878fc028b73a369b14d83973cb28cdf4f76dd546dbec5fdeea9f1b3 +size 678017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0e1cad7e19..e51ac317ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:633d2c1b0d0cf29dc48d693a9084215289b0ad2cd2c5ef6910f90864569f90aa -size 733329 +oid 
sha256:f8d27b4edc92168853983c12d13cf1c8009fea2b43c40eb63119d8b9f18270f4 +size 760511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 659ecb0159..8a314e46ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d4dceef3b336f10c7e234c104ddb907ef75250a38309f5646876569eda0541 -size 641413 +oid sha256:3868761900f130c3e31742ceb844bbb6974befe3686558d0fb884e68b550d843 +size 665289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3bfdb888a3..309f2d2bd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0d06ea9e29125be9141ec6a55d9b840d205a2ac93654a7e763c4cab760a112f -size 741121 +oid sha256:eb7ab45382078006eed5d96e09efd3312fe4feb33246699da7244ec7b2afdf4b +size 767613 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 13a12f2167..7dc75504b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a6e53b2950258860b089058079d4f2867b4a7200a6ded33c067ad033c96123e -size 664597 +oid sha256:c3b926c9afdd1c8bee895e7b9b96765f66a929949b64befc73525a3cf19c56a7 +size 690547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 59ecf029b7..e5bbc75d58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d720372cfab531ba564a7e9d941c3e81ac3dcd13441c00f14f68ac811eeb9cc5 -size 730317 +oid sha256:caff3f7e102758559cd68eaa40549932dd4ed0489136da20ed164e81797eb797 +size 754885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 21d53b23df..30c7d4782a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:350874c29df1250b13a55f40e32e4dad93218cbbd27718252571839eb113949f -size 650685 +oid sha256:176b6d5363138bd92eeb9719091164858b2205bda6a456f2fe1b761233702d50 +size 675499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 59e4a4a35a..0e09779698 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0d3c49ee07457080cadf7be2b1212cba38bd13c813db188cc0f2f39d850a2e1 -size 654743 +oid sha256:ee897f48edf2b6f2643d9f6e4dd0280ab511a7ff7fb4404bc6c430c4d322987d +size 677731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
00c3b3051d..f879b7395d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff956860cddd39ef263585a268f7bfdc93af4e8ac13b25af1cd7811520909962 -size 567562 +oid sha256:278becb99f844994f830598050d807318400fa92534b7858e8f6355392643d1d +size 584828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 308c4e240a..7429ac9046 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c583a374837aecff238f723389639b020873be547eedc774c78f7aea018da87 -size 653359 +oid sha256:a2cda4d9e911900875da0cca14ecfe09ca1e75ab172ddab66a1f4d819076436c +size 673093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index efac72939c..f4c327d185 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b26c94265ffdbff76fee049d75b6b9723a36c2d0a11e815fc7620bfa8e9cb95 -size 573676 +oid sha256:3e6faea757b1dfc255c2d7863f946d143a6bb48e1d771c8e3987f1de7fb29228 +size 592670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eca8250a52..efa848ec6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:036f247d2d6408f757106d865f5e13c5155081fd22f898ba795bf10111dc0628 -size 673907 +oid sha256:ce8a5e335a410b4730334e604d7e6fcb22f15cefe1ea2a0843fc17902e985590 +size 673709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ff1ea1531a..ae8aa93682 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e41ee106b39452ef6c863baa32ff7a19bee9e2e969573bcfffccb9896ef14f7 -size 575132 +oid sha256:5aaaf8fe623802029aed7d8ab1360bfbd9ebb53ebb2878da83c94e9adfde508f +size 585000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f3481231a0..b558599cf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8761e076c9d4637a49247e13245e787e5695c9acc6ed05920cd31302e185b4b5 -size 668083 +oid sha256:604568faaaa8a4da90c248bba1dfa862ac2b1a371aaf0984efdd6a50685bf377 +size 670649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 5778a4b7da..06bb10eb84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:df7c00fbb35090f468b3ceecf1132ed050bd8006d70a3eb7557e183484f7766c -size 575476 +oid sha256:5836402bf603ce4c87090c1d7178b7b9ffa65a04df6cc43bfd1ba081afd08847 +size 584800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3e489a5034..e2ffa1c2d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7df13fffa86db3c05a89bb9d85e284b956c639255366f4774c0fa7ab15fede1f +oid sha256:72ac94c157174e5a02dfd6d200b5bbd8dd86948016f0ebcc27b118ba69b807af size 740145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2b58bcf31e..9daa8ddb52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1eb95d5386f8f070482f50a9d498367f953697fd071a5a113a8ddcb03761730 -size 641717 +oid 
sha256:2dcdd6ea9990b70d39f77166ac72d96ff34a0fcee15d75166657b3cd63b71720 +size 652421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b30d868f85..9913bb1c4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:723011679b1d616fdce373f43969cd031283229a837fc1b11f3edacc96eb363e -size 761179 +oid sha256:ef4fedef0c97ff4d5dcba62c3655cdc445e9e9ce13a5a96ddcbbe1a9802be4da +size 761723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c1fdfba1aa..8ad839bfdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b39139c8da4afa90299e8aaceb4e0f3d9accf926776cd5781fe5b6729e252d11 -size 655895 +oid sha256:3b52e80fffaa7e4275947f3c7a950cea95eb7b63d6bf5e9c6ce5e209026e5877 +size 673605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 24f12400ad..ab7d23d056 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7bad6aa0c30bbfa8df8d9639308173c8aad7b9bf5145dd1a87834b10ddd546d -size 754027 +oid sha256:7eb91294bef8b611b7812d2a08bf69bb38a154058546b2f1a1ce0942129ee9cd +size 752003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c133500d51..5f92732323 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2a4daffca34561631ac1fdde277c64b5261008adda34db3fd270f505bec2d86 -size 648741 +oid sha256:118c0981ed1df3491c23a4c717da0e981062f16a98f71a54dd55b4d1465057c4 +size 663935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 857c5bdd8e..533008821b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:852e6ae7762247f4ea8ff14b170c2886d06a9ae79043697a8dfd26021856cb42 -size 755109 +oid sha256:61ef3fe3523bacf3687fb8f8ad488cc555eef8fb40a808112f09fe3b2c474def +size 756885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index cd520875a3..ecf6661e93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:596c7095448b1e26d25766100f253935222a8b9509b7a798c3092df9022a4d00 -size 655597 +oid sha256:99a2aa5f60f6e21f325eb522e0d254e1868e91ed3899353d0d533c8ac042d412 +size 672913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cab29c7643..3246a82dfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f41c96024998a8076f3e1e1f8b2ac39dd031f4967e2f13bba65106ead3f6569 -size 748055 +oid sha256:0d1685363d12b008c8dcc9aac6bfc1761900e425e29b5f9a47c9427ab66ea6b9 +size 746969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e500535ead..9b089113a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45cc58185fbb16c2331644791497df272d61b5ba9e95790a962e1831716567a8 -size 648443 +oid sha256:778a53e7d1fa8d09ff42a2323f390d2c58f14b9a65c002caa8ddabfaab868451 +size 663293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3189ed2757..f3a1b7d44a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:c3bf09159875809054a4c036286ca9b764f05de32bcb16fe013742106a18420b -size 831413 +oid sha256:135a79761a31d7a82a491751beb7cb4c78dea9f5874ded9ff1e9097ed62d21f5 +size 847939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ed00b743ab..bbccf56568 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60206ad0704012088e2b533dbe244be49be4edc58f1393edf9c949fecd0885d4 -size 725289 +oid sha256:fc146110ccf99516350efab49bdef56f9cb96f97eefb3de483719c833661a2c1 +size 755087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 89e6a7cb4f..b64bc785d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a183584df039d75d8c95f36b384de81ab9af63c0beba8d396ddf5356b2c81d2b -size 
824259 +oid sha256:c697031cfcf3481c612bad9cfd8d7dc53b5779ca7aa7d089771f0bc5d949caf9 +size 838221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e7a8470663..e8302980e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:012f807cd8a265d6cce37d5aa00f0d2d5015fa523a3cb6de19be0339415264d3 -size 718135 +oid sha256:65fd483d6091f93ee9874e08ca91f855ccdd556094317ff5ea66cd70e7c41ab7 +size 745467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eb3636a4d5..19ff59012a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d1caa5ef026068ccaf4f1372ba02b82cc38ce0c8929171896acd39a373f380c -size 629161 +oid sha256:37aadfbf8585ed571b935f199bc7f6ac89931804adf568c23a960d3d7b27d972 +size 632813 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e460a969a6..5f2a8ea4ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea4ffd71c5551e3854d2f26cc3ec458d05fcbb9bcea79ee680cf5e2082a49d6a -size 542228 +oid sha256:6a2090cff8d6fa18100e00a0cf54586f9b4b12e48e98ed29596dd43aebb7fb79 +size 548196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 82821618e9..fb5456e3a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ece49c98b26a50888b259c04d78de32b2caa60273d81a7b2237268ed91ba8da2 -size 630147 +oid sha256:e52e3059cbc04a92888e31198e143077d096338e5dfb18b9679499de37c2a013 +size 639667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 7f8812b17b..20af29ee07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a609ef0e50a7b82a46aa3b0e355090a63a4bb5dbdca7c5493f19831591094f8 -size 544642 +oid sha256:f6ba93dedc89c8eb6452b86e5ede89b8d5dbdb388f43b6e6019b07c8d80593fb +size 551598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index cf9141a9ba..abe0285232 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37f25d24808a394f31dce0b139789f59b3ae3634c0710efa1f004f07df40c1d1 -size 697865 +oid sha256:f4d25741f0d0ac6f2d06875a26d435450797af0e9a4062524fd02f40adb1832e +size 699593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
8d9be9877d..a00cbf6403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be54f3cb82117c5cb87020c4207f050fbec64728e29b24f187995c5f79f6aa56 -size 608662 +oid sha256:2725bdfb24b6827f31d935f477e8ee595f49f15fc60b9f7f3555bed357fba9b8 +size 614878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 70b2202a3d..943d8d1505 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de57181b7bd5c25607a256ad11063d4da0a23582826cb461d75785f4a0adac8c -size 717865 +oid sha256:f7263c4d031e4e324c782c5be6de45ee9fb43ba7b1c8806e80ccc49ebd7c6d9f +size 734687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ac9f515806..29572cea8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee28042afedf2383dba2ffa1e007f74d502332f82c95440671b5d233f4b16eff -size 631721 +oid sha256:909a1c0887e00d75500596ec1996c18160bedad324c95d7b652b3a2dfe38d9eb +size 648247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6b4688cc0d..2dd2f91685 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c98f38966666852cbb3bc4fdde45f53d74b5201ad996c87118762095f852da94 -size 704397 +oid sha256:482b81f77e5cc47b56fda408c0ff4b659c05a968ea45f7f747abad9b23566a96 +size 715251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 50693ed272..78f3c30819 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f863509dc44dd3e86f7fc5c029759b496da782e05dc80d693dd7f04e1ef5b165 -size 618203 +oid sha256:bd6e5459618951f1573eecdb909c6f6e236b6ab9e1214419fe95b0fbc13fca42 +size 629649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b8b3083eb5..72bf581503 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72d01af759ea1b6a7162e09d7f7102637d9ae6579fda4b8336063bf4c684171a -size 721465 +oid sha256:64d5bda0098d9c9de4889f7690a9ad82eeb91c1fa7ea1699da17c4a99bdba294 +size 743023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 860b5bf8bc..4d62f0fb6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0641ef5a792a2cb8fbfbfae60fce82546b7206643950ab782a25bd477aab6074 -size 635419 +oid sha256:38e72662fc7cd8bc5da33dda102f61227bfaab59dbc896c5ca6011d8e11f96c8 +size 652241 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7c73cad68d..8e3c489720 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7587f048f5a57e199bff4a9fe970666edf2d180c63d243fef1d7367746e2b55 -size 707897 +oid sha256:7d87c7653627887bdd8e08cea26d76ca529505e36fe7a8976feeb2e9a0e70be8 +size 723537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 78950b883e..6df477f518 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5cf99dbae696aa78bcdef9b39e866f6ecc5e3ca4a48a2988f6a486da67eaa13 -size 622741 +oid sha256:315c529d607dbf00aa7d736516432107e59aa08499ced12db6850bfcdcfaf0dc +size 633643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9099153195..3ebd96c765 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72975f4f121babd6f088ea416d9f44e5b98c975620cdb2a53555cd081d187393 -size 784299 +oid sha256:38c75edd6ebc92525f2fcca8ed956744d4cc8aba0a0c03d168d9d1a31d1abf1a +size 827515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ee32cf2692..69c12db963 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05d4802de26491f6945a7e87b6e4cbe45341eb50805aa57c2bb234ff3aacd8d0 -size 699291 +oid sha256:59dd4909edfe9d55424ab240e9a6452f7da06c8f7659d23c4e64723dda5f18e6 +size 716359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 343936b8fb..fb5212876f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c83ef1a7fc85069c6e483ecd8a114fe827c08bd50b5b92c7ba24f6de72319e3c -size 770733 +oid sha256:812667b03551276fcfaff3103fe2747ce68683889e1384463f087397f5ec7d63 +size 807239 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index bf620c64d9..4bee5d11aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:246dd3cf44d1c14bcf9a329acf9a35d16b33780a0d16293311b6d0dbeb0670f8 -size 685823 +oid sha256:ba54247ac79a464633e9e148b73490bc00ad01b63408132bb3523bff1a1204d6 +size 697761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9546cff94e..b0dea7ce52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ce72ce0c1967391134d7fea65575ef28f10a2d21c4459354f67e860dedcf9c -size 673559 +oid sha256:537a40395338c1171eed4fbf3de79c419acce2f5c14552b739c97b7e11d3b193 +size 680959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 389ea2193b..2b59e7058b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a14ea4a7d88ccc0e92e34fbfc9d0674aedb532367a76a8500fcdd83b3d6257f4 -size 539018 +oid sha256:8ca9e5e612f173b4fcbef1e580687fce8a520ccc634386a82ae211187dded1aa +size 549330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0b8a935f97..061e3a3321 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3388745895f856025111854af7aa335a8c317ef8a1f599d2e95f892e6ec76030 -size 678985 +oid sha256:16922d0121b1e596fa058976f93adda327695a0923b0ac3c2afad53ffd857740 +size 684953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 1d7495ee11..b88ecfda95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7aa505413911f8d7537baea3d2f9e2b92e659f634a2e37260aa800d7780a000c -size 540250 +oid sha256:9f5650ef304cccac96320f2372a440d952f28dd1cc74ef22979ab138dc4740de +size 549328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index de8b8ed417..e02e132bda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:184c3585843c85edaced1269486898d6be6259476377824369258de825a775cb -size 740587 +oid sha256:2ea8de7d251af7df04a60dcf8aeae21604a3a150e7c36325007d022cf122087a +size 747345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 54d60d1d5a..932f951ae1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce60175a012f2b7fb3d189cd811c33cfc6ef7b1c6caa47ff3741b5108b9d774a -size 605650 +oid sha256:ccaf83dbdf12f70ec6204b0dd3568bc19856a8cb3b966d937ddd8d9913b24631 +size 615912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index be9a74181c..e864011591 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:0eb336277a3171def055aee1d43aaee8dc30ced7e9b5cf51f57b18aaccfb40a4 -size 758809 +oid sha256:4f2ce0c6af065a90fd43d5d0f6968bfba7b118027004015aa17cbcb8c91869ab +size 770403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b0f13cbddc..82a835e414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09e31a0875fe76e4d1cc5fdf15aea81bed5d8721e9fae75989821cfeac7118b4 -size 619731 +oid sha256:86aa832dd2de3736e52e39d6970597601edabad3318a9a88fe9e49257bebcbbc +size 633593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8a706a3e38..76a7b8bc30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27a65dfc1b60a80f540c3f0646bb7a97b3f3c9e9d39e48c2a392ce8a0bdf5f57 -size 751657 +oid sha256:71b800542d9f472a8395586dbf1527dcf61ece1dbc2252e9001f79dfcf937074 +size 760685 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e8ede17868..91eac549b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6997809c9a1d14c6033a20158a18e1ab57ba2541bcf39e8b4638b740ce6dc85c -size 612576 +oid sha256:e3f37da9cf987906f79c1d543dd10f9ffd2386602ad2b95383adc33d3de68fff +size 624763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a38f75f9b..3df3aaa427 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89c279d01173c5bb945f0e5770bf44abebd16ca45a15377fe0603374a231f4c0 -size 762507 +oid sha256:579ed264ed6089b627db3f4fe3c872624f7529697d083e86163d185c19e032ec +size 773509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 884e3a7e67..6d7999bd9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b671c631792122f8b161534f0720671c5b70a3a316d22b8489f50a975b01869f -size 620271 +oid sha256:e5651600b8a8605645d487f1fcf5be619d4a187db3770b1423bd6a4157f05806 +size 633987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2257723d5b..3115babcce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfcbf4c7acb2efde228f6873316c999112e10b66524c2bde13a8f1020b2f1dbc -size 755453 +oid sha256:cab6ba642a64d663c90f726812146fda2d9b2bc96ce55fa16d99086a02cf8cb3 +size 762903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8da2d1621f..637f249c4a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:130a7f99c4d4c19efb2f84c780d630a4fba8aab282c073b541d892dd4888f835 -size 613118 +oid sha256:b247a55b3251bc8dd8e537011e38021b12fde77b4a98a15efbf14cd0267f41f0 +size 624317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 82f6d9255c..4d9b7ca665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:435a767e1c75ff2a1650d5c66c93944767abaecc668a61407960788d7997a0e5 -size 828353 +oid sha256:9243224f235ffeb20113eef25281e1ddc599978c5c264e3b3e0bb28e6c77f15f +size 839945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b4dde3e1b3..ea5a842d38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcc8db6c3591ee8097f990ace94031e0072d076c430914b21bb6a1bb1d23cc13 -size 687843 +oid sha256:a1942032e88e2183f30849b4bafb718aac55776c1da2568c39048ac1a807c0f2 +size 717591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bf01b76d2f..05654ad135 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c31d93ed39932445c42c5dcd7e03627b36a4849d08a4b8f1219bda25182b3745 -size 821989 +oid sha256:0f94d774c54f2e0860421fca93312454c6f2f003cb27a7b455375b3dc6541e67 +size 830177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cf63f4e5bc..ae1211b3ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89ba8c84302cee403704d01d0713d8f11a8d6a19e4975a183e34f593768d70a8 -size 680689 +oid sha256:b5608c0d4afbf7a9c5d1034a0a2dacf590aa46b324bf8b063d6dee76ba5cf18b +size 708761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 42d858a25f..162bb35294 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d6a3f1692790e4be6c75cfcb0b20d017d432c431e7e5afcd8130dd5a62f349a -size 678051 +oid sha256:5263b3bf938fb9a217d196aaf16b144c7d554777ff3fcece269cc1493575ca4d +size 688855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c2790ff1ed..4fb3011509 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2c6e8c3a94eab75efcc1deb42b190ad19c406ff7bc639832dc942df92d3d5148 -size 593682 +oid sha256:697406436f27436ab4463189fe7a5cf5159903195135a76d0123f85a0475e41d +size 604240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 63e7841fc3..c38c9df65a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:420752a04b58c7307056e525bed8f2acd2e3f31f9bc2c54cf1d69b8a96c0ba3d -size 675977 +oid sha256:c635bf4a3dbc28e697e015deedfc1598acdc03c75c99e07b7b65469f1c0c6536 +size 685055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index ff8a426639..c138e957c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62b8b387478eaa42de3ad751524c717be94b39146f7592e4c0baf4254fda49b0 -size 593236 +oid sha256:d61b1ef9069e54bb8de2264f3160fe6249f8684cdfd25d6cd10c1327bbbcd1bd +size 603250 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index deb527707d..f099b6cefb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0417ed6011f969a0b8994a83e67d388ebc45ba5d4856be9efa7c3648da121ab7 -size 744337 +oid sha256:3bc703e7d58f8646d483cc28f35668e2f7bd2f9c0d2355cc2c00041c38737ad7 +size 755289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2aa1655723..ebb48c1d21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a6d84e5f0fd0fdceae9f40881beafb89b2e7ea180aaa1ae1832859d7b5a80be -size 660265 +oid sha256:25d9672de8b03782385093909f8b8cbcb7e58e2f25a6b3f319b312dc042370d6 +size 670873 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a1570e4832..41d304cd63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bba4aba652d9767a3873d19a71fb0b4bce02cbcfc0a461263890b3deee4cebe7 -size 765373 +oid sha256:c7011484ba3711f824da473e56c878e6dc7dd7249019d5b31bcb236cedbe4d52 +size 776867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c91ed0a366..20675879c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23cfa8e1ac906021cbc21545f2e1cdcb700b857f630befc8c50801e4da0c84e4 -size 673555 +oid sha256:dbf447c202223f50b9ccf7ae6cfd60342d5fd1e6173fa8961ff74f014750d8ff +size 692499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c9183becde..bba8338570 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d21a79529aa925f8be1eb0cb66979c46973572330d19598298050606685bcf6b -size 758219 +oid sha256:0798df997db617d55ba01c595ce4fa0e50ab90f21e053630094577b2cc84277d +size 767149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0db9f4fde9..4f12f069e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f8dd9be21b9765d3c0af5ca96310081f903aea3582c64824143cae51f9d0c50 -size 666403 +oid sha256:2594a29d30b93709ff6aba50566d79676ba17e7f6c8f93568a44cf4fd5c0e5e7 +size 682879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c5f3e7cdbe..8126e252bd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7df940242d916902220554d7fe7f0a8c4725e0736b1645cff82eed2362486f08 -size 761473 +oid sha256:69d039602ed5c2b4cd6f388dd18bf8780f13b07226c9175a8d8a2eb8a0071796 +size 772081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 939b6e239a..a5d5ce20bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6db5b069980ca29a1be8edc75c428b51027c52e5c86a530ea625d2e1553f980 -size 674047 +oid sha256:c0aa806058849daa53e127ec26804be5c837e825476a864bebe69b77ae7a0a1b +size 691067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index dfc7c1a19a..f153c95865 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2ef19954c794df2028aa96256e31e5d4b1e34d550cc838acd3b61d95e264c4d -size 754419 +oid sha256:c9affabad4a3f7bc25c2e530dbc63bc11bd1a91a0a82da74c2c0f100483d8350 +size 762115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 08827382c2..58c0023abc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dcc566af05460f903dfca23515b0206e12022f2095468d635bbe44a0ab8e0db -size 666893 +oid sha256:764e198a934de34971cff3a1d5ddace5abf7cb3c845f24fdbe817a4e71f55daf +size 682237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 238a5d519d..d7ac244e24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:35299cd6cf244c83809a8aa5ee9d26414c90f5a296cf95b494066cda311d4823 -size 834867 +oid sha256:21a38c8ed5360fd9db17e1a80e7b7be1451d95af7d1facf73c576d7f284003d5 +size 862345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 756c2c7f1b..f09b49fb17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8bbc3236a9956ecc2d5de0a9b7b8b74fe8c425790020ed5ac3c730c4c48161b -size 742161 +oid sha256:fc2b5353b46cb46a43f28ef8a61026cf0756ce9bfac914f5546c0d0f18edd869 +size 773537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 384e80a0d6..d56671f0ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88095cdee5321560675993e31b2bf723284927c387ce4355433207b12c55ebd0 -size 
827713 +oid sha256:1afec2a3ed994bb8a034f64780cbd22e25ebdaf2ecaca2be4b2a5f573caca734 +size 852577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2952c42e9d..bfa366c574 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e7f281be5c286270c5f0caed00dfb22df7a87d6f2d0eb548ccc840092c0c883 -size 735007 +oid sha256:8649519a87c169822a1dbce3e18da0e787d9ab4bef8ad6f086bd8528f2eca366 +size 763917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8c77e11ea7..7b05da168f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f64523011862d11c4c1cbc51a1ba9f7d189b5620f8a0d1969b1fe62af776dee -size 643961 +oid sha256:072a049e5ee3979cf6857f08d816f4aa7d5c46d4ba2ca75123390c35f61f8800 +size 654963 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d319057f5c..3817e31738 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de562e62d51de4f0e42d26d6585e661a88b43d3883028931bc6d4a83364ab745 -size 554216 +oid sha256:2d3659184493e5c7963e8fbbe8ca7f08e4e956ca9af565c90b113c509c754db1 +size 563836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 017b60003d..96312c975a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e41574684594cf278565686f1ec993b05d5279387c20adce0b71fc2bc1a4efc0 -size 644503 +oid sha256:ee898d48357f26f18a46fe52ab84dace2a3e6dfeb7c1a2bfe7dd467bd22a41a7 +size 656441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 84da128ea2..491ae1f985 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:836f5b9adcaa4482d5c62b4efb3bb8eb1f6784687d0082d224874ba0f0b8b7a9 -size 556630 +oid sha256:99bc236be727d59e9647cfc3cf6b0a1fc08649b94ba55b00eecfd38403e4b6aa +size 567138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fc196b5cb6..c7f99310b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24fbd1ff753440e93df50adad8bf75cf04e2ad6e31709960d654cb4f7216d6be -size 711235 +oid sha256:9710986cb9ed8be91bb0b8e79214cd810a420c15e5e4ddcf342916d53a86093e +size 725591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
dfb5743e29..bc2c9efa3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:582a2dae7a13b143a0d469700d4e1bb60b207656feda9c0fd21cb00222e4bfb4 -size 620651 +oid sha256:71c8420ef97aa1d353aedf5c7cbffc737a37a58fc3f763d77c2c482a978f8323 +size 630469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 66cefec45a..e20bd87f44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:573e1d90616a837b65309bfc15c93018ed422c1fb164646379318ff4c347ddce -size 732221 +oid sha256:5ad8b1fc68fe2f38ee415a14000e4dcde168587beb9fb8908cf6fc704cb922dc +size 756987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 44ce4b9983..d5b2ce8a44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:741aabbf243a38417c6b540b191c2502f32d257bff5e3788dde1b12acfd051d5 -size 643611 +oid sha256:88e3c0fb89d925c39b2a5559b2818c9a2acc75b2dae4b3213a2ad2a9e2d3159c +size 663837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 321e7a2bb6..f0299c1760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac21fe69824bd25a25919cfa97f07c52173f5c98027377276475ec0f41ac29c8 -size 718753 +oid sha256:f3ebb3e825879701bd19edaf6a4f4f74a8adc4d8c7eb2e7441661bef96e29667 +size 736759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index be6c01c11d..703db510d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d2e187712893fcecb10ec842d28cf1503a9290a4fd1a9d3741ef0c10c84b867d -size 630093 +oid sha256:0778a6e8aeea8015b07d4aa63910650e3b4b93e3a4b1dbcdd72166fabd7c6f69 +size 645287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9e4c3633ff..a7f878f621 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81f72b32bc574b415d05d33799edb664a23aa8da317944ea709bb8010871e33c -size 737497 +oid sha256:89dc3871c923d5b1a5ea7df78ec1663d5cbb0798143ff03a893a86f05ed53bdc +size 760487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c8ddad2801..258d636cd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb5bf7499c3e23f13013f681c3d96b7e8a3fa4696c72559a0d6f06de0993f2d4 -size 647309 +oid sha256:a9a79554b21cd67811bc4714b6bd1da4e56dde0c117b31e70cf8a5a1d67642bc +size 667733 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 73e9bf91e8..59c18c441d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:349c130d2d579b61f4915742c227ca77a9367a6a06f65657c31a0209a55b8147 -size 723981 +oid sha256:e75b69ba0e930a0bb9f651f59af766823688ccabf1e0cd605f2d309c320b5431 +size 740211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 16bac9dde1..a43eaf5e88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4174f20ababfcceb2dff4a7e23451c5c47c9804b8c4649e552b6f4325cc18c52 -size 633841 +oid sha256:0a730fc9db6dcbd64d3b7198da78a87f2909286fa23975f296c58ffbb8e27fa9 +size 649183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b7f876f9b..608e5d4e34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26dd1c4e7b1d2e86e7d79ce500796c8fdcf5eacf8dc5179faab75241432e3daf -size 798655 +oid sha256:0b8ee73faf818e7f773f2a2d6608a6d061c977300faee3ff36a5a4e1ea0a8c0e +size 844191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f6036f55b0..7a97444b15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7204dfb9bd0af1a3c56b25de7fe876d70394b7f827ace831009a4138cb58746 -size 710391 +oid sha256:17b39968f8b2aa12905b3cc7cbf73f8161d38485da4740859f2c7bf85e3533eb +size 731949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7f7815a75e..6a330235d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6379960a399dad4735290dfac4ef9dc0f2406c3653c279bf5ab19b70fd6c6d24 -size 785089 +oid sha256:e5c40da7b76a9c0fb9a37c7efd813d8e72d81ab614235526235449871bc40341 +size 823963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index eb289f61cf..00aacdc00e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b82841507a202e53bc27f12314eeb5c1b3023596937891443cc58010bdda3a5e -size 696923 +oid sha256:5af4eec961d88a781ae6224f4679ae57d20ecc7206c2b6a4dc9e17d9b15afa9a +size 713399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bcbd271a37..c823f815d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ee417d125da8919a4fd82314af48636692ddb4b8fa1247f3f80f941a0dabd83 -size 613470 +oid sha256:cb117e3c84735649a489e3d2c57777f439ba097830d494c278d1b289e361f98f +size 632071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 72f6fbdde6..81373591b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c312911fc1ddbfb60021d66451053b7b8a3a07f64d57402a4678354e2519b79 -size 530434 +oid sha256:924fc2b32882fd2d6eb960975abd404daa12582a9f08fc8c27f826cdb516966d +size 544594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index eac00c6a3a..647e9bf8af 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21517a47d60a85d2815b4c411d6ef8c019d97fbbc5566e14bba85ce2fa12880c -size 610706 +oid sha256:619f551009aa0414ef9782622737ec7e87da4d1a1b4a1ea559deb934ff0970f7 +size 628911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 694896af1a..c285f743bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de186ed370b5f31971d4ed8ecb5f74a9ad3d9333aee3f6ab9008ca7476d2981d -size 530876 +oid sha256:6ece983e9727fcaef40e368ab27963a52afad0fda60b486d0a3db798a8401af9 +size 544592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3f13128a2f..25c06ffda6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26607bbe4314be36394b26df0240f47c661f4826576b2cd53fb1ca21faf386ea -size 679759 +oid sha256:5dd21b86e72567e9f9374557be9ac90ad09d0751385a48b09e0cfc955973045d +size 698455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 04191df754..d4d58b2e0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fc6e1dfec3b1ea2ff13ab41f035cf71a5259c288cbededbbed0cb4faae42f41 -size 597018 +oid sha256:3dd05a675320bc91531c92067a17352219e4274eb4730a0d8a0f5d2abbf458a3 +size 612014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0db77efdc6..bd62b14532 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f2deb9d8da68587228849fea265d0511a9c63b639439ee6c04fd546f4453edc6 -size 698721 +oid sha256:307eb3767afb2a715dc66206d6d62503ff483c3976e5069a5baf4c0bf957c49d +size 721513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b7ceb3009d..3221d7d16d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8de076002fe1b2faa7b05de2da108837e6712977f29339d2bc89a6d4998f24c8 -size 611096 +oid sha256:69e3d3b1647c9c2cabb8cf4859b8a33b84f99b5e87c1e7b5bf3f2d3841fc8605 +size 629647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d975d2ec20..9452e5274d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e3dd68b7edad1aee1d8dcc19d881e6cb6e70f38c5c9d416569dd5b0392d41d3 -size 691569 +oid sha256:2f2529d9f41e0e26a45cdd4557b9c41bcf707c255f967a3ec6f2036584106473 +size 710957 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3c7a08254f..5bc2c45da6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:838b70a26251b38f9d7054f80d49731712e8bf0d46c556854d9f7c8007d581f1 -size 603944 +oid sha256:19c5b6d086b65c26468eb87f84a508920644f328507213d1cb081d0627c3b3a3 +size 620027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ada9075f9..6ac91adb77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b9324cb8ae74df87e724d51ee6dd5a7bdafb3f201b31defcfd404893f74a9b1 -size 695019 +oid sha256:688de7350293f9f9d3d9a28ce145eb371dc972443b20555c8c803470691a3900 +size 716677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8cb336a062..4818fc5b75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3c2feae1c08cc3d063b94d4b4173bd67b8c6f30866587c350ba826623a82aca -size 610898 +oid sha256:e747211e65850b19c5373f0bae8115116c9d8f6545988d20bd23268ff4535a0a +size 629251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9b054ae9d2..458438ccaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e39b857884077e47d12ff6eb2fcbc2966460e581e4ddce6ba0cb70e13526fcd -size 687965 +oid sha256:10f91822d563ad1a20e0b4bb544e8661dd13d14ff7ba7116cfb506a37ea2707b +size 706761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0261271385..dd4f0223b0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:307f5d32bd136a7434e1229eba9f873786d88050b8249809edf6315ec112ef05 -size 603744 +oid sha256:ec92ff07046451bea37f86a49bf6958e5561d13f2b57fad374e470f91d459fbe +size 620419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3918da797a..6dddb83193 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a19daeabf51b0cb352db193d7455ead576a74d78d9fdf2c8da6c96e38752a3cb -size 768313 +oid sha256:32d51e1e288559482658ff1dcd11c3743a36337e3c171602fcd42c5768d2b385 +size 806153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ef9a50c534..ec41153587 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c7a6bd5bcb8eae78add85a62c227d2b0b91dc6c0424aad22a7f619a74416227 -size 677829 +oid sha256:2d154da62e7efcc3d303d05bcd2d42dda09b45a55e6e56dac10fcdbc5dc8f2fc +size 713645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9ea879a341..003a5821b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff453a2861ea2e048308b064fcd89dd56fb7f02550bcf049c0cd30d376d3f08f -size 761949 +oid sha256:3202109dce820bb1b19556cd076a81bbcd64626599f51f844836d4a6a17a68c7 +size 796433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3f642e0091..ceb34e3814 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fe69a1f6241aec9d056727273206113ed7c03ac84f50ee447e52fd987746496 -size 670675 +oid sha256:ec386f49ceeea11893c89c5a97af4b05be08b5bcc518dc00de6ba0ff8474ce8b +size 704025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 97b20fa4c2..a326be6cd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28b615bf2e52457ea06bf9d06ff955020269d47acdf5d0a689271b915af9af33 -size 656689 +oid sha256:1e729333a705570e07ef63d3a8802723678e86b7da9177a4df6d8c98bc05b7ca +size 664533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 28faa16809..83ec867b4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:674f62e3d517d01c63b90e5fe5118fe4f60bb56f0ef211aeeed6e6fcbb08e7ff -size 574048 +oid sha256:aedfde3d3b586c6f1c9d424a9a33cd0bbc6c10ff3d854a3ad8e9b73693787f32 +size 578684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 25bbe01f69..8199f2acc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e294a18e24029bff77c01bf71b45bf85e58600eddb33cc045eb548adf8c4c8a2 -size 654615 +oid sha256:6c55da8e49540f9c815fd86ba269cd0d7448fb90ba78322cb690cc17c351e8b0 +size 661473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4886b075a8..55fb44e218 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21e15a30ee4d3d7a7623a8df1a5065573e145b4b4d98443347a63e54ceb5b19c -size 573602 +oid sha256:36ac3f8b96c4631190c7e6f8f8903960be7fa8eb712a85819bf0f2ba693a2760 +size 578486 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c175ce05fd..871e562047 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1934511648003588ca89da14e009fa4d1897f3251253d0ce4244ae855177186 -size 722977 +oid sha256:355ad4a2a8f4c5e24fa8fbeef11f0a7272ddfb77c884f774eff431d05ef90bfe +size 730919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index aadd71df8b..7e924b4dca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb6524be30b9af83590e2ad399bf1d9a2ff7540932c8dd9bf86a6f540a3da37e -size 640631 +oid sha256:3152f696db973e451a363b80fb0a9c7709388b71001e5cb1ab4690c0b1deb2d9 +size 646057 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7715237b4b..4f2ef1d46f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41b494e8506b80d2f54c4e4b1174e1ef9dc3290df12d99e82458535b1905f3c1 -size 744061 +oid sha256:7ae63ebcaeff0e3ce88deb259ce99cb1b1f61e1502f946c6544c1ee90de6c045 +size 752547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4ca8981572..f6fa04b8cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7df4bf8af42a128416127aaebb153e40df87a14a0240db900b655d7c2b12bd6c -size 654069 +oid sha256:3e0e7fde3bd3136e65964d434281988d3c6fd157d8e7439c5d704a18978219fa +size 667241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a99e9f773b..105f84907e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96c372291719b4289767967f764684bd267a6e1cbd6744dd355eb7fcaae4257e -size 736907 +oid sha256:f9bfad736e4638b438140d2c5650b068463a24c561b93009f4c97a36f80551de +size 742827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3e61a6967a..587e97316b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10d0819d5f404a771490e4f3e90d8139230a049e8bad31336d88ca9a6f058964 -size 646915 +oid sha256:f6e39290ad6f5fdf27f2de5ef27519a8752e5fc5fdcb15e0d707110ff6cf53bd +size 657621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b720f2a735..449d81fa76 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e04663c2d6428728a1abe61317b5b192ddae83264a2ff3b5c989226a900c507 -size 740161 +oid sha256:877b67ad355ef56674bd3ac5362281c0441c1da950cb96c6d69f34f60eb216c2 +size 747709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0f5241738b..6c879c92a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89b27d52353f9c676b7229604f574b72a5b152527cce3f139e35684dd41e02c7 -size 654511 +oid sha256:836956ecbc5c22802f4bc8f2bd9bc1dee5fedaee2b88cdd979e98c1ff03f567b +size 666597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 55049ba7c9..7b60d11998 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8b0ab0f59540e49784d82e6097a522bc95328851521f843f5dffb852d25ac74 -size 733107 +oid sha256:2b0e528d722c021faa4f9cf8bd327986a89db56b8c539a3441ab1dd69136cad3 +size 737793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 557b556f9c..aee9a31905 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65c57bf45c2a842d90662d90f194722325e8c095331aca1eb90f5e1e3f9863a3 -size 647357 +oid sha256:a7710790117e3388cfcca06c667a1a0ad8ec056d42210496620aeba7ad62a61a +size 656977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 16e227980a..4b023f4381 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:d9f16bb9a4fe51fe4f1d1daefeca6b44a55b907cd3561c2bf5a6c6493e23e664 -size 813505 +oid sha256:37861c7b5b0447cbeedd53601e4ff91b3f202534aff674b1ef917d61ec6c0672 +size 838763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 742a9738d1..6ce2761e92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:439ca5c8dd969e345c1ba021aa0798d013eeca5504d4e1fc1caf160c5f7ce369 -size 722675 +oid sha256:6069f3e80f27c17e85709dc704e52c0dd9705a756bdc372ed1edee55eac1fc85 +size 748723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9825a99e77..11582b4d20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b698d22ada8ef1238ca4dfe617b93fd9a68da3afebead74c8f0ac80470d78de -size 
806351 +oid sha256:e810b3910760c9d0dff898398f553cdeb53de994108889aaf649ff968776c35a +size 829045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1e14cc2066..caaba72932 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e8f9f7dbb47363cf7112c6cedb3c2a295de1a307f9f596858bfb00a4afcc959 -size 715521 +oid sha256:012e214dc0801fe38d6679fbd5fbd277d566eb5a2e23e045cd306434c09c5aa1 +size 739891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 00b8bd6f3a..1e5a02b3ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cdaa0f0059475439101db3fb243eab1361c94e26726607aa3911ed3e49a49b0 -size 621565 +oid sha256:ee405cb9b40ac992d2cd637c86b2f947f3b365a3396e218fb3fedeb50122babd +size 632467 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6033457266..fda39898c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5bf15d0f3cb56afdd3835c49db54f8b7eb8e4bc3d98cffed56e08824a9db5e -size 534976 +oid sha256:f20b9fcbb6c23dc4beac5a1ecfa4d7a44da7dcc2f3ed110ab89984330f1d6c94 +size 543806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b5da7f8b10..bb987025f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef42330772cf2db63bd4860399cc6c7a565c14d53a0b075d2ea3d1a31bd64c96 -size 622105 +oid sha256:adb80beb81f77631f9380de783e6d481a38225b3d77fe5b703c3553e8720f67f +size 633945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index f4690e522d..b40859e819 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6a86d4d0a9b96ec8ac30c821fbfe3932cec16dadb2b90799980a5afe5f62ff4 -size 537390 +oid sha256:399fd17663769398e1cbc5c295fe67b69ef763c526da813fcb7a2d25fde0db54 +size 547060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 263969348b..2e58bfdcea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45c032e84f4650447c779fd113f6025b311e09bdd66b79840fa0177ec7f9874c -size 688789 +oid sha256:85e050f39a4241e095c4cb93f790634982604d116d8c61354b0398ad3db14897 +size 703095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
6ef4bee8b4..a48fa13439 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa59441d959f8ff57d58231114bf916ea9bd589879599bf2818987a930f14862 -size 600572 +oid sha256:f98453d1a9a2bc6b0b381079fe06b31a01448b9f6f96769af580b136fb0ab85f +size 610488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d4da5cb4e2..ca48f31bd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d3d8ae5cba1d039ec2e8e9fae2345ba08c478832b39a5c462f7a7b85d43e3a8 -size 709823 +oid sha256:7572cdaa1f259bdd828885c23aa27ba33f3074a4af94f411cf8d8d513125743f +size 728965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e7d31ffa44..0c1a79cef1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d385ca3300018cba780cc65522e28be257ffbe3e89dbf433b38d8030f82673aa -size 623679 +oid sha256:8d5f8d1048294e98e6272d170af94c258d6fe358957b5ed7b5ff9cd614b5b606 +size 644647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 221c707f0c..e5f510c9b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6944470513526e094b0d61d3641225b9ee1bebea89436d379b433abe26efa492 -size 696307 +oid sha256:4eece5e197de6f4bb0b5d1a70e3332675554764a0ec9ba95ec6cf431afbd8b9a +size 709527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9c525b3990..ed656c0016 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2462aec4ad95211565fe9428cb9c8d0d9aa0a91548303279ffa8011d0fc0d28b -size 610162 +oid sha256:6c666028da4cd029fc41974865020a600e17fc8215370ae068f6c2300a24869f +size 625259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ae888443d2..f2507d585a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13430d8311ed783fa1f342c6c15e8e20ab2bf16ad91dfc6a1f959dcd74f6b8e4 -size 715101 +oid sha256:cb380d40a308be6a5ebefd6e43448a38337efbdafbb1b72ac985e0d9d973cb39 +size 737843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3cf89d16a2..ceb350d905 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80453342ca7cdec375d3485ee24d40b4f8a4472800ffed7d409bbf8cf0e02b71 -size 627377 +oid sha256:1b8b929f7bce3a4ffde82928b9963d391c3efb797dd3794253014616c4c437e6 +size 648443 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 05518be8ad..b03ca5ad61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c78d00baa9c98a324a9c169d3db6948bbe624e778ace2e2029e6d4857cef7569 -size 701583 +oid sha256:bda7c9a2b3beb2991fa3a695cb766d0786e4571f5d5c03adab79ca0e6432ada8 +size 717617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8d62e33320..fd14ba0d0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5e79e03dcf448ca955f99224ea2ffa7a301be17a2c66b34d0153947b48dacfe -size 614648 +oid sha256:a3b2d26be0dd9b3670e0a4c8a9c69d06d9a627bf8d439496d8b02067266b0333 +size 629055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ccf7693470..4cbf536ef2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e973ed355fa664a08c467e18e740554789310bb15ce3735d7623c036eea06d62 -size 776259 +oid sha256:83e898c6557dbabde19276820f781bff165c4c53c9d566e427b00a5b175c90e5 +size 820807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 838fb189ad..6a76a2816e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf68e0372e21f719ac7a3eaa5b899cbe7337f85fc320c7a809d87b726a5e8a72 -size 691249 +oid sha256:f0f0845d75da523f7ee26d969628f3c1fa45bcc279d9d56afcb2d743470ebaff +size 712759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e54d8199ee..2f286a850d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476e7f2597af40a19fa85645565e2e2f768998d2515e92126f4b46e685cd2014 -size 763431 +oid sha256:a9c0b7185638b66cd15c9f3dd1e22c99a7c2793813e4ec448299320988f078c7 +size 801319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b171f3fb8b..7e9e7878f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efc94faf855dc097ebe6fd66df671c2afca2d69d50169c7d7350e34568d44aae -size 677731 +oid sha256:818e786019f947e28296df9ecf556dae2f1f2635fa4c1453621da9212b443e35 +size 694159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 235e6d3188..6fdae3f629 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b1db413016ab05e80816d5f02483a0ff51e2cef550388a654dee4610869b1f4 -size 621661 +oid sha256:8a5396f6d427ea7c08928f6ef43f3c39932d2cac8880d8392ebedbd22dda039b +size 637645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 35205509d9..a08e511c8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1eed6af15ec385c26a4bcd86d45e9d6669ca18e11beb601efbe005135b5266e -size 538574 +oid sha256:3aee18ed353a271c93504eb022355308d86eca27d96d835aa0b361c065ea2323 +size 550168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f8de4a0b0c..702656203f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05095fcc3fae62698d791b3a32388897cec57b316ced0304c13701468070b732 -size 619637 +oid sha256:80b5690036098e1d1953f46d133b76700f6fdabf4c2c53eabedd37d9cdce6ed8 +size 634485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index fca012a36c..123f0a844b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87db3f7f832ec0fe889a258fb1eae1b0ac428362d7ec277beb25b7e0a18e28d5 -size 539016 +oid sha256:eb9bcb6ad4174be498f0f83decae9827d9a8a93b624247157853f769484e5a72 +size 550166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 09eb2c569c..1cb261bb17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a58445b983e9d19c150d7998ed7dc4bd1cfee439db3301abd1f4fc1e6e800062 -size 687899 +oid sha256:60adb00f534340529d55045b205cc9621677214476917832e2c043446b7beb0d +size 704031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 18474c3617..732a5dbc5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4c476dbb1b2f058e60622811c0b846ae998b1885f97997f92a2077b4292d1ee -size 605158 +oid sha256:86cfcd24dd3b1ae83f87a926a7e058fc2abfb0a787ee0625df2e5118b178ebe8 +size 617590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f7b31caa2a..190be10c53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:ebfda5fe619bf36727366cd602000efcf3a8b3741ceee1c54405ffd1d99355c7 -size 707701 +oid sha256:6bc09927ad4a76202109de07a8650236aff55759b23ae08746066a99b818c166 +size 726299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 968f4fbce2..80f5124da9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fd0e8ce8be1577a4ff8fc0585dfd84faaa8b62a149a2c01d5771e4cc4ca8f89 -size 619287 +oid sha256:a82b7d846fc564f1389db98f485fc4cdfe5831f2b7517aebcbd2c2394b7d4357 +size 635221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cb62ee1795..aa78bfa6dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c3cd0d9c620f0f1df9983932560b30fcafc7eb58d6b6e977a9f7324a4fe94a3 -size 700547 +oid sha256:92f56648d3b7debe0d9ddb63042eb37098c0c31833b7cf44dec7cdb95cca7618 +size 716531 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 31ffd60a4f..f8bfde22d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9583f49dcf4b939a23ca985691a7d556ce22b8dc5c3aa6c8e183a22aa4dc4119 -size 612132 +oid sha256:a279e3efd863b5b5b81730962d861dad9a11b3ae45e5ec06ea942ea22151146d +size 625601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ec56961565..be0440589f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35745a18a9a363ff24bdbf76072836487b5948555d909981c3f35102bf1594e7 -size 703999 +oid sha256:c08e505ae840e375b0a3abb54478466e71adbac319ec9b5c5cfcefd24abb5d8e +size 722251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index de2dd17ace..b77a147745 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d29ebc3e187ba3059108029ae7e9c585675b7e91de6db39e97fb4e79aa3ff21 -size 619827 +oid sha256:6fd5fb5ff2671e79322c469228f38594a7806be8b224717bf8d4f827835769b3 +size 634825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e162842924..ab1a26915f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ca862c6e1a54d22531e9a770840741ab2e99bb2c010ea2f1f87267d4331ac3f -size 696943 +oid sha256:a2d468c3ad18084e628ae085255fabde2385509c5dfa3f2c58128e27fd1b43e0 +size 712335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 49c1153045..eec88b62a5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f967145b6396849495700d59afaf633cad48acbb63dc9561c0bc99f0fdeac038 -size 612674 +oid sha256:a58e5b6f26c00b1944876b03021422ed69deb8d5afd66caa83912568600c3f95 +size 625205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fb83bb5e4f..cf8f39845d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fb58b567d5784eb6c44153b3be9f74e3c64c9bc2406c1cb6dc97b6f9224be12 -size 777243 +oid sha256:dca35aeb6d46c3f1f7aad8d2b14e4a3831c55070af3274162334fd42338b972d +size 811727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2b0246d39c..58a943efb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faee53e011b797cb5703e289f6a5112eb9907b625968c6e25d491667a4f0bdc5 -size 686757 +oid sha256:baea9720c003add7939ce4e900ae043faa7fb8d8facd5352551251c86a208054 +size 719219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 790e55f772..79022bfabc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43534c076584e2c28d36f39b9ea955594d6a1042739ac2fb0a81dc63cc7f3db4 -size 770089 +oid sha256:4968ce6337a67353233b58bc4bc70c36db7254817c6c0d4cb9eab29493c27dfe +size 802009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9bfd8d3a05..8bbeb0d3e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21441a567e16615bbd72106f317f1840f5190e94dccdcca3ac7034d6e6225bc3 -size 679653 +oid sha256:530cba8ae53a9aa5d5c75e7b3fe15211c7df6cdf094207a77330665265b5ccf9 +size 709599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2997e75258..d03784829b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:829f9b5a04380afd626237f0a7c76b4968a8feb34735833e29261d4e0c2e28ec -size 718505 +oid sha256:6d0b24995a622d48acda2ae1c204bdffd0db1ef1010cd1b85440a78399b4943a +size 703063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1bb590b233..b9dc32093a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6707589e8002a2c66ba6a3ab5cfbcc30982d6035e35015302f6e79fd9d2546f6 -size 630535 +oid sha256:00e7de33db424f89abbcc98f79974c33121717c8156ad960e30b6ff3d33885b1 +size 618497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d785e14b30..a80e052f49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8af25a933767472b18c324a60d4fffb6dc834eb325949edc00eee99459d3508 -size 716135 +oid sha256:4edc3c998023d396ce01296fcfac3b854b739e6cb0a6da83f8d1ada487af1fce +size 699213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b7e341b5b8..793d3ebdd5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84adecc200f464798c7eaa59bc547361288b8138c043f3f30000e32fd1155a5f -size 631421 +oid sha256:b1d3ed306ccb6136e0f7da44703e93c46ede2fc3c2e8faeb6388d74c7129baef +size 617310 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 36d91230da..925d59156f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfad205f962eb5190b92d2cc93fe2ddbdf4a96b856f89bfbe65f96a36e8edfb1 -size 784741 +oid sha256:c2246c0ad795b40261cf06194d71a2deeb0bf0aeebf2fc12d511f7c7e2578210 +size 770287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 85375626ad..e1eaf8d14b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a8c076a61c6a72a49968cae8365f53ee53a5898707e9c1e422043bf2f17dedf -size 698943 +oid sha256:61d748fa8fe3f173ebc982d390f8dca82e2dbbb6d9b6a0444eabf8a9a3f88f3a +size 686807 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fa6ed63b4f..3f474a0f43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ef14e52432177cd2d4d93db240cc78c6245b4f4870cd09f70b02241cc82c2cc -size 818703 +oid sha256:b26392bda0b602065b3b1bc8207f02e367af4323a6f7ceb887307d8ef0387597 +size 812387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d69d4b25e5..710ed9bad0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8bca6ffe309f6ab03c9844f86508512cd7293981aa35dd953ccad9556a7792c -size 721951 +oid sha256:a8516a983fe31458d97eb5c5b34506ce79ab307a2d0f238841ae4bcde30529a6 +size 720965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 754e4c2ce3..2a4bead335 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77f491d0d51c7427c1aaab7be1715c1406bfa44e1874f1de7d27657104b5f5e0 -size 805185 +oid sha256:42ef10230e4df1915376efeac8e7b826f12c599d539303a126dc1ce037b86bc2 +size 790829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cfa6a5a28b..e80d966467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:866d21f663203e2821c5f36415e878f6d9f22632f108acdc278681a5485584aa -size 708483 +oid sha256:ad714a5398a72b2ae304f013bee4156b31414bbfc2c464b4bd3538fe8253f988 +size 698025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 36ec70023e..5431a18df1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21ae1a6ae80c62279877a6abd651ca070020f3253bb8b91f23d471173bfa5bc5 -size 814013 +oid sha256:b55a7de045d84e9a6eb6c802858c54f6e25c1b4c82ae9e7692cfc5d010c5129c +size 804591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b7f685c302..6956537489 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c36a81f0a2de4c280cc0e7dba52bba1c52434d831aac411229c561f1b208e20 -size 722641 +oid sha256:f280cce78dcb181cf7b5920293b66066025e376f37dba24f22b0f95749c520d8 +size 720075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f9437847ad..bdfc9a51d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1ff1a86a78690260d44e11a040f31ef87beeae399c702bc0da8bc21e65f2540 -size 799805 +oid sha256:7b335038fdbe0936489307bd76dd67fc7aba3c024ea090d180b1619ef71e01d3 +size 785105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a8bff32869..20d394a667 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b1b09da3e1a867edf8df8f59282ea9c62a4cf5b27794642ac48a184745f8279 -size 708383 +oid sha256:01313bf0f994feb533b2d38fb0b53a799f70fcb10b5e3bbf6d95205f118aa032 +size 698221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b92ecb6f40..eaa6dfff41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:7a2b872fdd8b0e780354cb5fbb450279581da1d9c94228402327ae591681ed52 -size 888935 +oid sha256:6706117c8f932fd11acfaeab361575eff6d68efc24554da594985b2c436dc537 +size 882719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 02ccb37792..a188697ec9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f146d5be2b10180131c49aa95c3a31e398f5df0f6f6810cc10e94e859572e91 -size 792185 +oid sha256:acc90bb1189f21c0525302fd642a726922096aaac465bcea72b117ffa9932f31 +size 789619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ad83adfb5e..7448774f45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdd4e3190fab44087f4836351c229f6089c8974bab9db91fd3ba15e63f57f36f -size 
875419 +oid sha256:7c24d1cd8725c2a1379679bf943672f50fdc9a1cfd401356500d23022b20b267 +size 861161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 51f6dbc16c..c576565a7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebe054489dc58e1121e38f457c554c270b4f36a37140bce54da534880ff2786b -size 777879 +oid sha256:df31f5fda81c26c863c9159adcde1b75214c11dd125d442eaec23f6715e1b944 +size 766729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9014a78163..6a84525dca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56d1fd0656152bcdaf0fff6412e37dbc5257a0abd5a96b29174560070abafbe4 -size 652891 +oid sha256:fe09f968f8c9407e7c8c11880fdb256987f5e240b471e7adb7ebaa8d669aa8f2 +size 665569 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 131e8048d4..bdd9546444 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef614095dfd19a600c413e266c0ff62c3a560ee1791e2f8f2e020042ddf25724 -size 554906 +oid sha256:b26030c698b0189dea2f4e03ef83bde8e1d1624c7030f745255c3cd68b725a55 +size 562996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a8ca2fb6ec..78257e9afd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:041dfc2094c4ccdf35ecf67083e3d5bf237769f77c2348ba88c1a86c25485ed3 -size 650323 +oid sha256:caeccdae232a0476a051e74cb5efb1bd5237e66974b3724a0ff7b8ad5175e02c +size 662953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 800efc06bf..238e7f65b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac817660748a4ac8cc076d75532001c6983cf4233dd67abbc35f009cf169aecc -size 568766 +oid sha256:2d34b524bb079d4ff9226bfaf4db2855e20f7dd8089ba8a48ed6c389d8f0d705 +size 581642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 255d347fe1..d086824bf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de6e31a2152c8a34325ad66479238b0325300a0e8770f9300d1965d93f194ef9 -size 720017 +oid sha256:ec58c3c81ec8291c6ad9f9611132709d4b09190bcbcf99733bfe922fd4f14a6f +size 731313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
4d95d7e9b9..2e1228ef90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6a583c39528fa375b20297afcfd931c4b4568c851043697cb13bc86b8ac44c5 -size 623217 +oid sha256:4e81932f7b2d053ca1d999c955444be83aa489e2aa4885928bbe3d2dbdfada04 +size 628841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 494dc197eb..8722932e33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e89a6012c3cc0c3f75edc26d4e1753096285d3eab5bec0a4fba568b8aa87b6ba -size 771095 +oid sha256:e7f1e047134e575045b6ef10736fe5c317082e6877a247100159aeaca3f3ca13 +size 808737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 47052d9d1e..2b665ebb06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b917f639e3a291100b4a007f1db56cec5189caf3716c8fb8292c7defb9c4f20d -size 664183 +oid sha256:39023d35d608463e0481f3d8ea3010945a0c7cd08f7c5df71a0ad22b5a52e48c +size 687517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 12274ebbd7..8086b4b0fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29ee3ac88aa46d5912013a979840a9040ba89bc0418421862a0e067f955fd34d -size 743321 +oid sha256:fda78177b2e27237db16627a1a920391a2bcfa1e072205f8b0bf7d64895dbb68 +size 755211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ca0e8b3c41..31be8eebf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2b3c5fcb1c04ecd0f3c650ebf56885b680cc67afda7c46d68ff3e8910e7eaa38 -size 637197 +oid sha256:b7221a7050080ca611218301634b8de7125de5776b4f33846ecb89f161f4810a +size 649975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bad727caa5..329a925acf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d89afaf7b1df4115852e59acb648e153ef6a645d55864ec783a0b8e25c46c03 -size 764877 +oid sha256:b984bc706f101e5877e20a712da48fe6dd08e78526898e315c11b6ae747a638b +size 811497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c6d494d24c..f5b6ddefca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56b82c2b2c21c8cd9bcecd9c0d582635d9fca1021399397f8870038f110fd02b -size 679375 +oid sha256:b94d87325129381d2b1d95e8c7c27c60d76d8d1fc07c3b2554fc64d3959b0c46 +size 716967 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8244ba84a7..a2c0025688 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d42db3032034c00e1fac931fe841e53ea3b5f5ed712db1697730ad22b4d54e -size 737053 +oid sha256:a21484be7adfb680e50132e75d22a98bd6d18fe689fbfd0dec389586fdb09a77 +size 756393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 85b667fa68..1ef1039c39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0276bd06c00b06f223884beaa4238036d5a4692418930ef8025d795e77c8f8d3 -size 651601 +oid sha256:31571581b4af84e151c1a4bc774e7fe5b0d1b1a495bf5383c3d24131d00ba4dc +size 672617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 91460120fa..3e21f176cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8708f2626e81ba1f33711e506114751bf725b7c49baf61a78247e5c3f241173c -size 834225 +oid sha256:e17a9f23ba2a5d43dd4afa9908e70564a52ff5cff9e3f0142f7bc50c985a0bbe +size 880845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d53d731e8e..755dfb5425 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:554f571c7bbdceb7fc1668a7fa202dd1943cd1258a613999b4bfeb68f0ffb630 -size 731555 +oid sha256:48fbe4b399027792f213339b6167f396959cd070748456f9649b77f66fe1b6fd +size 755481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e422cc5818..662ca8550c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a7e09ec5671e67d51342e64f2e0f3fa4548bf320a1cfc9382604643cfa186f0 -size 806401 +oid sha256:03f56519ac744927050c89dc8a08fbaf0fe327946a496c3adf0f39546350675d +size 827615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b0f5e4e1ae..3b076906be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c1ab50fec646d90968d1fdf0aa8ab6b5a0d5c0af0b9003b951f0240dc58da4c -size 704569 +oid sha256:e672e31555a7995af23b5072588f9f5b0dabc969e64d480d4823b3fa3ac4d0f5 +size 717939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c9302688ca..783a7a813e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b94641982967b92bad8ea63f44e1ff027bd1a89e21934a296ffdae3091089112 -size 657181 +oid sha256:2e670350d8d0c3a769b6385b11c1f692b90ba9bb1d0251a9b4ccd84a2c4176b4 +size 676915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c4222fa5b3..3c471412e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77ef1c3ad7098132f3ba2c7b1e3342fae47f41239203f37fb2ee987842829fde -size 573454 +oid sha256:488071cd7d68b6a3c5ddb62cdb48dda69482656aa06952bfacfce87e01b42fed +size 586576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 063240635d..5920f96d03 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80610f020b93a7929cfdda304ab464cd1d15de6a337a6d9017db71bf9713ef54 -size 654811 +oid sha256:c6b99b3b48152e2bab362e16b758474836c2d9f2c025f20dcb81282f146131a1 +size 671683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 57ff100886..8bdbb606cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dab8114c75ad14cb94d8c7df26a0a38931a5e8c11562d680448347c973ab231 -size 573156 +oid sha256:2c1c33ce67ac1a89700de755b5cdd5bfea4f0236e61a733d4d592ff2c9c25c54 +size 584700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3fe5b2b15e..6a7429029a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99f7c7e4102da56ddb415cd23ca66cd6280ac1423c8caecb977318003efeddc8 -size 724207 +oid sha256:4f5481becfd5f424ffe20b3d29f162a9dc46c776c552470132e7ce83bed1f764 +size 743299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7b565b6f10..372499790b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2f26c517b276b1abd4304bae987723c9e5f4fa110eb60c29ab42240eed000aa -size 642355 +oid sha256:898a1819136ade3b7a06475a68618e34bb41b0e9f7a139d151943f785e2093a2 +size 654443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index aaaead6c99..15bf1216ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:9b91e0616d99c6708409d48993b3e2f28ea5049a44b890a8516a862884e42656 -size 747809 +oid sha256:1df5a1393dcd7cfb1bbd33daf52a8f84a89e8bc52f761fc5e1c3108789ea790d +size 768529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f23499a139..c9691289ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01127793f0d08c93f6ceb77d3b02fc4f1683c6e168dd015ad68b90654e8e82aa -size 653425 +oid sha256:c396c26c1a0bb73a31af4338da4205e61d2f2a96453a7cd15c0885f24a4d91bb +size 673701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 11fcf579ce..32636c5a4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7acc4dc55636a532a9432f8ef5fd43512964f7eb8db9a17c68e1eb23210370d6 -size 740655 +oid sha256:635aa7e17b86fd1590122e7915c6d4050324dfa220f4899116fd7d7a6b171ce0 +size 758809 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a4fc7fe32f..5fb9d63cd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84297c8ecab43361c8f5939d9c3abb92a17ed8d5a7e41168041e825a00c380c2 -size 646273 +oid sha256:f3b76df27424e0c03d01df7e94a769bacd98eb13e16ffd565bec98fc7c819283 +size 664871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6b1ed91662..5cf4d32563 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9181356e8845bff0d63fe97730ab84758734bad77f3af9787bb69411fddd69d -size 737299 +oid sha256:7e09cfd17052d8f4ce52a2e9413d4ae1cc5800058056bc1e386fde1198ff2c42 +size 762507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d01cbb59f1..476b93968c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a8dfaf1146d61945c3270c7b51000232101588791e17f3b0bb7a5fb8dd0e664 -size 653227 +oid sha256:0a0b912f61dc6931e117ecb3152401304039af7812f88593a88eae5e4ed051d3 +size 671973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cf546b3fe3..a917eaa3bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a78783aeadf1e0f4473b631bfcc65765de7ac7eec5209f3ce8d60b742ae5112e -size 730243 +oid sha256:8693801145c30df29cce76253dce6d015c941a667052b4c1405cb4f9f97b4726 +size 752691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 24361ae5f8..9a5dfdf99b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:376c849d60e4c393002a60e299098af2947786db9e8c7340f235d235c9986a3b -size 646123 +oid sha256:821f069de6ed274c590b830baa2f060ffcda555f23b61a04092b1c100b56384e +size 663143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d7dd4e0f61..d285b0bdea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb515ffbbfe1990abde9962bec1e5694d19488e10629239bc1bf9683251760a5 -size 816809 +oid sha256:a9931240eab11cb97c60149c2f137b81cac82cf7d7eb8b4f8dfc4a5d905c972f +size 838367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5d04523630..dd0cedf423 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3c5ee4cb3e08a2cfbf97a53fce6852f173716fd063e5df94ca8499fd0af82d7 -size 723955 +oid sha256:5844e29a8e51ae7b3959460a289fad7b3939f3a7dd6395d6372db47c1816d747 +size 741715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6e74285707..390373eac8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ed85a5e7c1c1d14f255ec37245c97fc7a706019208a9e625d0146f8c221aa04 -size 809655 +oid sha256:8b2b3bb7745157cbcb664f398601d5fcec7d891cfc536fa95b61a856761181fe +size 829437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ef21032881..d75f7bede0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:590c576a36b382609f5596d93b437739fedc31234cd9918392aa001a990e90f4 -size 716801 +oid sha256:9146823bf8f2afe995985072dffe22d10e77ce1c7b70f9ac5c823e56379ba1af +size 732095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d370446747..f0be43f83f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0571f33d3cca68a51d1b510700d6d993ad59013ff4788246c3d099f4eaa06035 -size 820871 +oid sha256:c3a0ddff4f5198a60c6d2e885c07f274e207dcbadf309e830c529162815dc540 +size 830739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 89daf00798..e0433cd73d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92d5c9b19fd1564e7c54c728bac60b2a5401154338d731d5e3e504bef75b060a -size 799209 +oid sha256:bcdc93e66d1f6113a1aa5f934095d37b1d01b7e215c88d4e1a007c133df98f50 +size 809125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 81481abb71..333e51eda5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22862dd435244b57b2c94fc4924d337da7a9a1b430fbaf081d31d94014f3d52b -size 783605 +oid sha256:5fa714fbb934cf896a30d67954233c9c849798de545ba75777b313f471aaa869 +size 797419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a525d13cb7..61413cfe60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82228c8ea6bf5642e7283745ece61859b33d51ab9bfaa876e0650c2f5cbfe9bd -size 696179 +oid sha256:e5c5b03af73b3c21faa01e30f0a707217e7b0d7a6823880eba46b5b668bf1933 +size 690407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 87d1ffc147..2e6a6a3aa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a620f6ad3dde05e923343c95913792c1e400e53c2b3ff73f41d9c49d35752d93 -size 806615 +oid sha256:2453f6dd198dbf26dd952203327ad4de6eb5bbc4f73639cd11c732e41d585927 +size 808391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3d38f3448c..de1219f7f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1ebfd119f6c1148e4190c4c68f6e2b30134432c9be35be9de9789547189e7e0 -size 784901 +oid sha256:5bf5bfc1c0b8e90213a56562fa1518c6ee404f710dddcbb7efac844132187044 +size 788749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 566f3d2076..7cb802fc5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1008e4ef0483c346d0487f939463ffe6283c877d4fae933b9d93a98a9981b40 -size 763871 +oid sha256:bb5d36a230dc69a3ab0a6c76ffb9a42974d556666e129a4a11da4c332bdf4b5f +size 775367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6e5025c53e..aa10b38aaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8692df9fadf67da428a2b5ea9255e36cfb8d2829c832e4dbcae1acab19adf46 -size 682661 +oid sha256:2ceeccffcdc504554bbaa17e47bbf8d0e628a56df29e552bd95baed66938cd6c +size 668059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aede0dd6e9..be1a1b90ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51bc85f7eae483bc7673bc2873766b291eb16c95dd569c87c447ca668e9e2d40 -size 686731 +oid sha256:04d7f4d31c1c7b937e4848bee26e9632ea501a184b2d91490cf010528e02c057 +size 720129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 67513e2cfb..4fe9faf552 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f0a4fade7aa2fc163ccf0597a209898c06c3b32c148600ae3d47d334e8b1b39 -size 630489 +oid sha256:02c94bb1404549f7050955238ef255df0a8d806a19cb73ec5b0b05cf34de14e7 +size 670793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b7bb689d24..ecc6a0ec68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8f04d088ec735d12dc45d6161036fe52542a5be21877f6c8422a6778bd5a6af -size 675131 +oid sha256:0e3d01d18ce9824db3105c29c92d74e2651e71ca95cbdcb488043dbca6c9cae0 +size 707937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a83f41da5b..dc04f927d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6fbb20ae791c97b28031aa395be207d16bcfba0e4952d5c0f8a1e1c88a6b9c8 -size 625943 +oid sha256:01a2b723e4f5b05571081bf44169609af6f86ce2bfeae993b7782ad7db8b4d1c +size 663437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6de76116e1..62b144bac9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2db11911ed856a28b1c895629c50afd4db0f34369446c94f6c77f7ca18bb6af5 -size 679459 +oid sha256:d9a59d45ed36bfad13d1c713a9a3d2682a34aff4bbc4879835603c7637720ae9 +size 712463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a8433d6fb..7e797dc85c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d135382a6eb2b833a0136d49da2c908a7290792cd8e0ee4d52d713e05b671f9 -size 585470 +oid sha256:fd0bef70c583f00b093097f7e9e0bc772acbbe9bdcf3c6421c4bd9e413d57fe6 +size 609050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f035e58e0d..425bd0da5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1404aa5ad98c8c29b0db4d11034365cc8f052382f4ef6c7b351138a903433ed0 -size 622921 +oid sha256:d1e8749f75a7fe91093e2b23cfd4ca1c6bea3112e19ded2de58ba214190a3c9c +size 648969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 31996b7b3a..e0985d3113 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7189b08eaf48ec01573ef880391453303b44cf4c80d5e8b4cf93758e3e03bb42 -size 540820 +oid sha256:2aa92ef22353b08f9c40af6423a9eae8b34b54c6e141d109a2be26024537047a +size 572592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 78899ade96..9180c386c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:110ae9ad718fc098b2d88a2c08f8054be77b43eaa7a3469987c6a9c3782bfc11 -size 673213 +oid sha256:5ce56060c1acc86f80a15b1b88f801c26f0660969000de1c13dc5341ad269fb4 +size 696301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5f8bcc64b..471635c664 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:9ae5264628f02a674e610c6b919c31101274beb4227a9e2dc46f3b686dbf241f -size 616180 +oid sha256:1232e1c836b70495f12a6e042e019259e1ea970bd99a8147d3f26a6f42550a43 +size 646965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2d3b43325..f07da102cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e824d562e966c1f8e0bb5b4d0002348a1299d3dff20f6d41d5ce5e25a13a7b1 -size 660825 +oid sha256:8e6db23c25ab12fdafcf24e0b9fa4558a7b4ffcff84cc16db729b29e5cc11a3a +size 685985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12790a6998..316116de42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:293d2178b72babf30a68b82377521b15ade4f9a0a7ead43bbb054dfbd2981bc1 -size 611636 
+oid sha256:1c18cde0fefca0cae7f989a84725dd1a302a7d6b7112c318c6872b962f4679c0 +size 641533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 376358c8c8..968149c06e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d568ae129104e1e7772b5e8c422021537b80d9982b0b17e611d305a01518e03 -size 663573 +oid sha256:925aec9a4f3900cfcd305c0be3ef7425e7c1937f2f51c34ecaa53e662834f820 +size 688535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9edbaa669a..1509e15842 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:223a2398279407c30add5686fd176cdcd0c5b7d8b1afa344c5f689ff1fa66bb6 -size 571212 +oid sha256:066af1890d2f1c9051e3a4f394d93634131fddbd0ef0ff8fd7b1cb0acba40ffc +size 587738 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7c362a6d38..dff14f3771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9968ff52f92cad9ea24d3789cb795f86363abf7d7505e4e5f1456258b9aaf3ce -size 608514 +oid sha256:ccde9873509a916ff539272c519606885891008bc9e45183ea7ba3b976605394 +size 629531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d3709c30f1..06d4546f1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4168edd7702ec6d6e205ef081082d94453b35559e121cb23a1aa90b77668cc2a -size 527352 +oid sha256:ecfc83d1b936f7fffd6ec489e14c9d597e4fe4ca35e635de895a18617f7e2e38 +size 551378 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c138b73661..f3e0532a22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b126efe191d7b5c7a3078ddca9fd87728482caba0f2f8ab3d353be4fc32c825b -size 707287 +oid sha256:a87f0313e152783ecbf564ee8deaa068831c1ca3ed71b673aa355c112962bf11 +size 737725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 237f6eac88..1df8406a8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46f76b30781115588c9523ba31fd44e01491c32e4953a19347825171fc9d9ff1 -size 638811 +oid 
sha256:f0172182d582bc23d86273e35cc6250f5b95835ee4f19ea440fbeb980ea573b1 +size 688391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a183053a4a..ecb6a7160c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eacea3e7697ae20da236fbe332662d0efcdf1b39d2f75711a39a8c247db4aa29 -size 695737 +oid sha256:578ca0933db3bd34cdf34ea682c5485d4d397ba99d0b068549fb16878a4459e9 +size 725535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3bd1d37bbe..1abb3e6562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:bcc0b19a4e3a42f378012f89226463fca87e3c4c7e3ee5115e652f34fc635ccf -size 644133 +oid sha256:41cb43265d3fd0f93fa50f726d4196cfc9545bcff142c28731d99ccd470ff9b2 +size 680245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4bedf58271..1dee745481 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da6beaff163f39b74f77af5c47d4e2082cb30272acdd6eb760c0f67b49c8cc69 -size 697943 +oid sha256:3236ca0a366fb502b4a1f0e1b15d18735ed5366ebdc1334b12576e571b316f73 +size 731293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 69c8d98ff6..04548a5b2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:54c4516bcc967df3323352221729eeb482ea597e0558890a25812b6aaa4ce3a8 -size 590584 +oid sha256:68fe61c6522692d991fd44ca4c7aced8e37090940a7c555700940778e6bb0628 +size 609824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2a6b6d0ed3..64d3c7a07e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be7a91448cfb8e66411d087c005d175f6d6d848957b2055b54967113690593dd -size 649447 +oid sha256:9f12491bb7366c88eecd5eb9f33f2b82a14433d56807a80fe198234b451190ba +size 689603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0036e8f20c..4b4f547cef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f947d8f59963a443e2257349cd30396dbf0e912bae1b1180d8bcb4c0878e2cbe -size 546182 +oid sha256:b9e6fa38564b2ff6de3244947430f236ef65653fd30f78e1ac657e7b50cd3593 +size 572034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 587c447240..0220f525cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5bd0f7fefab8ce68cb8bf6d9c72f1b293604b9d1f6262b199483f314e4b2c80 -size 693819 +oid sha256:1cc959158d2a72aed3cfa879e983ac967c906be1045ce12210d5d2c568d1b9b7 +size 713897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6136ce8658..6313457756 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1846538aa1fe066ef381b2919534ffaf85f239750f15bcfc082169d18c0ec315 -size 624503 +oid sha256:13de13d1a59921a3a748cfdd29a0cb0a5a27594502d52451b04d3b432d61c882 +size 664563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4ad0ca409..eef89ca94c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59e0e31ad4a137fedd8d4da913cd5d7ed6984e8acf95ccef06e2c0119b29a435 -size 681431 +oid sha256:3fbc701754953618198d6ab34f63152b11a5d867ddb643a9e5265df0115fa89d +size 704371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d2e2e4e6c3..d86f69b931 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af09754c20a9aa409032859dfa3af389e5d197935b1a13a2e63a013fab8c6b01 -size 630615 +oid sha256:49691c424ee4184a4b058e3d647dd381805b946146024d7f8c6c378ec702cbca +size 658341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f3fabe5c21..c1755bbc56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eddc31247d4dcc263ee67edfb1c73104e0dffd3e6dd4fb87f60569d9443c5d5 -size 682057 +oid sha256:826f0433392b416a6d870f855a6144febfdc6958a0da2238ead0c47f84591572 +size 706477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0bed3bfa35..67ef5a5ddb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e7b442e52318b1c1ea6d57f14da669de251ef98d6407fa8b2a6616af6bf35d1 -size 576328 +oid sha256:1eebbe8dcf3f34a6c04481a94ad2eb99d2260d231f6a920d56679dba2e1cba1a +size 590486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index eaa16ec836..9a24b5217e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7172c73e8cd5384569de1cb0e8d0c9b0f728fd0e9d34f0baf1fe37c5576ec54d -size 634153 +oid sha256:f62822efb3c52b61fd801409dab10ea5989b5c5f7211e8d52546de34cf2a2293 +size 664789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 688f5035af..ceffeb5074 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b39a5b2be900dab8d99d06aa7ccbb26111cf5809f1f5b17da56dd84d32265eb -size 531926 +oid sha256:0934f50559a9e80186b166b89571413a6874e02564513a436c9cd5c4a031c04c +size 552646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index efc137919b..193fd0a4e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c2f110798e535b6f0ba2fdd2fc1e1077abf07a242811690093da2d84477479e -size 825653 +oid sha256:bfecc285df23382946e5ac1dfe62468ecc774d6bb86e0adf43b9576e5ede1e19 +size 888257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2da76f6e3f..ba452ac4ef 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f71b960570f0a9e53c71844765e2fe4ae3b88e265e75858bda1097ee261b223 -size 773407 +oid sha256:8dfe04cd031f67a7f8ca274ff22c3b0438bf08f8816b82534fa055150b8dece5 +size 844793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 02590d0743..2402a48140 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:470df6965ee064d542b4a4e27177cff5025f3ba28608e038800506fc419a3fbf -size 808281 +oid sha256:6362f24b36c33b718dd12934bf23de4cfc376bed4bab8898df963eaaa1a7f26a +size 872217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3ece2ed9fc..b6b1c51b44 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a45307987aef716eb32768460567690ad8ed191b99e338a0578393a8640b2653 -size 764127 +oid sha256:4916ef2c6253e0c93c4e52782811e8a908ccf7d77077d2f329457a9a7a40b834 +size 834871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7157ad4a70..978d36e08a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a8f3d551e8ebaab85066ba881ca61d277352e03c7b8a5f908f4e617c37f701b -size 795885 +oid sha256:f6f4761a96182e749b9f35e07d3073eb536a1a799de05656cbebfa2c21420bf4 +size 862781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2908e3c5f..5c094299a7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d10a67e8d28b02485f0fbe0b88a9cc0b0ac22c3fe10cf33d86373fe8394ddb10 -size 720891 +oid sha256:0e002655b3e8bea12868aff9a1e696d36e6e95cbefb9c87acee03b93714cff43 +size 775355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1c3b543c52..af3598132e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d94a82872bd8bac6e6abb679f8a85510283c63423c0025611cad6f7402dd98fb -size 735153 +oid sha256:2bc2a1441b8d9a5828564258a9f1a27d179e70a6cf2de9bd94ca8eb00c20daf8 +size 794501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6f296c0cd..0fe925e20d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c35053905488285a49275c59d6fe99af1bc90504ba85a1c9046497334b5808ad -size 673627 +oid sha256:e67d3bd9c5c2afd295de4ee39d3d69918dbb2a79b3259e909898e452811b1d2b +size 736577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 36c04d43b2..64ad58024f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e406217e08766791c90a8e63ae4e929621480f15a9167529d71904aea92dbdd -size 798667 +oid sha256:da682dc55e66c639e7455d622059b04e5d8ea7b9247a30903b034d1c0df85016 +size 841983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index efbbe7f7db..baed1c5839 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:119191ef80ca9b78ac32ddbeeb3acfe9beebd05c8439bbf1387496c5732c9d24 -size 746421 +oid sha256:d1cd0bfdd83455ebf4c4ecc21083d8430be1c2b94a501fdd3fd9c564e8394dc6 +size 798517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 326ae4e0d9..28cab10afa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2cad993f78e4b88c9ad6157c18fa65f845542861e6d8cb8bafb697904af848f -size 781297 +oid sha256:b964cbdc534d90aeaa5f5f4bff7085bc7af63f00fd4e7871af1c6d25361d6cf9 +size 827521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 80c7c99cee..d61cc72f38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38333c6033a6981223067fef012776695117e470922b64594599dc5e75120dea -size 736303 +oid sha256:574058617ad9f9537f0da060f23bc683b14db8bfaa3b86806c49f295b5a9a0f9 +size 789385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f0267541b6..4dca604dbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3ba1b72bd40b0e86ed0f210ff06d19825c0455a38b1a680a27da363a79d8178 -size 768061 +oid sha256:2d759baa351e49d87e500d83023192cbebe3fa66e008c6f4dd4cfff18c2d787a +size 815717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 95e41d7903..c9051e3a7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3ad0b6ce3e8db3147a7479cc0a05cb061c2c3e3381b4ef168075092588e0e75 -size 693905 +oid sha256:9dd7fb0c14ca57bda2c36a066befd31b9145c03b8b199d824378137d1ace48db +size 733421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c00ca4d00b..e3144bb05b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b47c44d0eb300dcab8a1a53a8b26608c24da7c6fcebfdfc918e89a70e9ad17a3 -size 706639 +oid sha256:684f73257de18e6ff86cb244de033627c976eae1619d8a73d1e1d62673df5390 +size 750891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f637fe9ba3..a8abfe6014 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1b183b835f423cd9f3fc4c6802ff95c5b7c83743a99ade2c0ffeb3e2d62d1df -size 646641 +oid sha256:b9bf3804016d30cfe6f24f1e23a930da381c2528461c6fddda34eb7d30289811 +size 694643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1f8c185299..0e2e162ab2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b624f2046f2686f0bf0f60034092f1ff973e122805c9c528fefd13c3ae6df35 -size 846111 +oid sha256:70671971f8243c519bbe07cc8c211689019636fff270c2ec775bfcbf40db27c6 +size 908665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 258d26fc38..0d79dd8db4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93213c5ac1271b1a7164e517eeff3b53c1b9d6e7d22582a56e65ae275d8c6bc9 -size 793815 +oid sha256:d0a72b7f50721e08936db525d812a300e412e098d580e02d1900189330e9feff +size 865151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8efd109df3..6dddee56e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:197e0a3185839c6570259dd3d5141a652f4f2fb3df99c7a4c6405856b5bff9a9 -size 828739 +oid sha256:cf22dbcd0df56f27a2e78beefcac885afd3fbc43029e22e901c1f0e84f1d5536 +size 890899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c2b84a4ca4..2db108924f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59a819e88a133dd5cb54572dd1449e9def11dc21d51effec3cf2cc876d8c16a2 -size 782957 +oid sha256:882f2a51f0a4aea60a8a2c2ec34f610bb8c45fc1c552b13b94a5b12d13cc554c +size 852763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 671f29b834..ef253ee1fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ac6859db529c4e5c7528ece42caad5983166f002d8a564d3824884bee5d5890 -size 813925 +oid sha256:1d76d04686f90f229ade312e48dcab1b80e12f0556490bff6a6ccefa280a08a7 +size 879095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 745d3bf290..3c567b597e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fe3e503b72880912b193927e1d673fad41ab83a0abc2e5e1d9099ee7b179f35 -size 703855 +oid sha256:0f7c11a2e0efbb8211bda10b5dbbc777831d0706b519c27ddc338d3cecc6efe4 +size 752547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0ce2e29797..8ac9ba142b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a22ec965890342fdb9206f376e0f101da89ac098a2a3e1888b2da6cd9125c7a0 -size 761383 +oid sha256:864fa8b8cec0f6ad6569a50288d9e7704c069b781bc00dcb813c5ea623048ba7 +size 830451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 998ac6f281..e6eb1f726e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2ccfc4f06cb14f8c0f03fb1034c59b3e4428726f3fdd8472b97836e969ee10c -size 656839 +oid sha256:892e186d86a00c3132c2c5eaf6d2a1e780aaeb2c8c2f5a630c788ce49b172fc5 +size 712289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec28c70eaa..5a2427725f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:523016993ff35d11f9749af9a37d6ab62012551fb6ae6319aa9eafc98602edee -size 819125 +oid sha256:fb0b2916e1ccd7996512dee263579624d7aeaafaa16e518b81a188f552348b71 +size 861157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6661c98f5..d20c0337dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f66f27eef15aac89686a13a2d606e98fcdf3925dae7159bcc00a9f146f561b06 -size 766041 +oid sha256:cd3335ce98c6fd4a6ea6fc65b263f114e6ca46bcd1d88ad6146a32230cd059d2 +size 816903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b6529dd70..6f4b798e0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba8bcdaa3f95a72380e81ce13d692da2245634d717c018e3de66f7fd4e34fb3e -size 801755 +oid sha256:28eb0cc6717ebe3d639746d57ba3e60222bcbf9104051338dd6bd3d1839771d1 +size 845957 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7e715b0aa..0454548f6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbf9de7ab4013bb7d906f880542d8deb9ea4cbd30a1f70a0403f977a89f8327b -size 755181 +oid sha256:49ac014992d2b2fc4cc39a99a62480d5306102c5969240efa8c2f5fa156bbb51 +size 806981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e21b4c1633..efd2747d92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bfee67ae4e502db41688efbea3eb308c89e30b653bfb26704e388c9911729b0 -size 785361 +oid sha256:a57671736ab7ecea0ad2b703d323cbb1667ac5b2158e318d684339d503c274a9 
+size 833955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 221c7cfb94..65c36a3fa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42b4f0a6ce12959d80af95187aaaf98a1a1382c659362ace9701fcfcd2648199 -size 676869 +oid sha256:d40e3a54bcaa9a27c2a37807cec45117386bef64ba001f48f7d09ff6dab7f682 +size 709133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 226f4f0f53..50bf235290 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:494f5c4919ce3a204e73d779b5686fc5a4f8b05fa3ba2e60dda408dfc30fb50e -size 733609 +oid sha256:095b0a13c4063e30415994eeecf194d747b8de768c44c0ff515096adf7cb5602 +size 786099 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d04ac1d0dd..79f4482040 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:857169aea083da08b3c84f3db0f04f8bdbae49b7e50ce5fe8889ceb72d8332d3 -size 629853 +oid sha256:15c75af0f7ebecf327c41eaa9032199ef032b0308fa531d3895028f542b480cd +size 668925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9f7225da73..53eafbee87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:352d16a20f345d94366807b1405fa065bb7cd0a99ad19ad00675da72a1fa6bed -size 724621 +oid sha256:a9ba40e4e98ed576fa33ea9e37e625ecd7e87f8d93bfeb99150e783558dc8369 +size 752889 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b8106cae86..74263f6a65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99262d5195418ee75019e7466045b714b1357e97591001adb91c3555ad78b57d -size 710209 +oid sha256:a97a189a2d6dd9f62171554abfa527656bf469967bd26154c705c5f73c1e7bfe +size 737047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7c9bd6d369..4be89086aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fbae733846c3e2e5bd892ed09c2b9b361e589f3eb19f6bf0fa1164ace24416d -size 698503 +oid sha256:2ff232ca2b6443aeea7fe966b69f3b45e1273817f736d2578d4805f060b5679c +size 718681 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d2773efe8c..548189048f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d65ecc5856e6005571abc9125fc3ec17d084f1c1aaf9e0bd130d2f6820da1f5 -size 611816 +oid sha256:25010322b5cae00530c23ffd2e26a5a2a47ce6a1581a4af27468490ccd73a662 +size 630267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a823b2e4e0..831cdf93d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c308aaea284568ae560f450666a84d1bf8f54a4f3d35e4b0c1eff0c866a11068 -size 717467 +oid sha256:b3b15eeb4b95487408daccd1f4ddcbf17a14dc6793fc16b0937926dc174005cb +size 743219 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d5da7ab2c6..cda1d896ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:470109b7aa0abca340ebb6ff41071ee7db42512f208139d1e7a8eeb563e5ffb9 -size 703055 +oid sha256:a4510b8f460d2bc913aa1899c70a7187226b46a9433acb7488e5c73550529479 +size 728117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 3cacc47b0a..6dc506edf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:add0d027b6f9078d457e381f4f0b99206341e5f195a29b74586d6467c7ca11bf -size 691349 +oid sha256:13f4c8539bc06d0e55aef8edd70ede0e5c8c62d1ffaf54cda3a504250a1e206b +size 708913 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 192990f2be..7af9744b31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f30f4f54f651f3473f43a6e819f25127d96a645daa75f6e6a2c5583ada0aca8d -size 604712 +oid sha256:6f8ed68eb4c0f25a9134c9ae2289d4512febd7ebefced8504b4ea18914240e4c +size 620597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e838f6b8a5..19ae4e3483 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80852fe45af456df4c80f55d3127c05b404d07a6dd0e31fa3e00eda3812f8911 -size 640453 +oid sha256:477207b553d9dce4aa53255a622e1a3cb222bfaacf488284e2b9c42fea0a54c4 +size 658855 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c92f622ffe..0e424568a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:406ff311476c226cd1b189fa45f4a852e5f372ef7bd6a39b4053a4fd5d93096c -size 605424 +oid sha256:78a0c4a076bf454912412aebad04509c192cc5d455f363d288a866f3c7c058ec +size 632015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3ab47a9d4a..30c19a8e09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7924d389bc6b7a31006f0f1bb4b1febc4692464a1da4901bf6d39b85ac14393 -size 632307 +oid sha256:a7fd8d70e2f39c97373aceef047157a21552bd6fd798820c1cea329a29cfacb6 +size 651795 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e2c8bdc9b9..11cc493831 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e578707810b787fe06c3e99a2268ea521029e985da56575b8ad2c4ecdd463f09 -size 600780 +oid sha256:cfe21fb8ab480aaccacd6ddf01fbe9783d067dc7c752033722546cfa0b5b0abe +size 625745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 75c156f74a..f2572bc661 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb64ddd2332e566a56c1786455978c5b12f5f3771fa21a95260e119a45ed1785 -size 635303 +oid sha256:63b49f34a06fcb15d0b955cfe3d0d873b008bc9ee4c613ec8457c368b59302c8 +size 644973 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eca6369b42..86d13285b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98c3759af7723ef95cd024bea3fbd2f6e41e9e6c1678f5467ff18fb8b9e41b6d -size 552858 +oid sha256:1f01608784b1e8ea6130dcb3adf0ad739a88d6c89f9a4f24bf66c42744841455 +size 564896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3563e68ad1..28829e9083 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:137eb9166beb993ed9644c01b2ed371e65a9820469e0c7901e07b8065b984f53 -size 599928 +oid sha256:fe913dad89d5dd2b8bdc8e1bd6c00a3f1138af5c1587c233781035d4ce73d6ac +size 616110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf60461e6a..a1f9467310 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68fcf5dbc424f84729fe8b43b769d305907d7143df6494ccade843e13456faec -size 515954 +oid sha256:4de0db76c96a7f10bf68db18ffbfb292a6c9b669b4eb714cce53641ffcd289c7 +size 535688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 695cfc9e18..487bf77f29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8702d02b630faa580e4fc8bb921653d954762bb78351063651c281b8a815122b -size 633301 +oid sha256:b04b3409eb7f252cb6d6ed9590eab297ca1b13b03e991faa0887df066c1c80ce +size 649975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 61b5e3b2c2..9c54e7c75a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:711f7867387d11fd03542977e1970b689171a5ad489b71709fa98b6200594cf2 -size 598270 +oid sha256:75158927e5927457b347341ad88bc31adfd4773e8c3582b6efeaf399a51e6336 +size 622347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 89fc071624..47d63a3c47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2edbb52d3341a5338d96db9e882419c2a5d763a337c3aedd935d7c4be0fa5f1a -size 625155 +oid sha256:814685849d9a48d5f1a4a589d36cc75fcf7adf2b4b79bb7ae865b85fe4d4b38f +size 642915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e410da8e4..7d9fc7bfa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bcf55bb363d826e360b31711b87a486e6eb09efb89e04b56daff11866876308 -size 592838 +oid sha256:12f29202d2a2b95b9473a846f1a609c78893d14855a154fc72c84875ec34117a +size 616074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a9939ec128..e0c4e28a8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d85d81d9bfa4631fc99908b786f0bd2b1b8c675964255e7860df88a8d31cfe1d -size 628249 +oid sha256:433bdc4e2539804091270d09d2b533807a888e08ee9ae477e8a073626fb0c20f +size 635105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b386838b74..a1a9c9060e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e03146d0d9859dee2d02d73b7e1c8ed55215363fab3b311e61516e7db074eda -size 545704 +oid sha256:f87fc2f78efd8e59acee8255a9dd3aed6fb069d66224342da4fc4bb392ac40d1 +size 555176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 11f3d6a419..206ef47f6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb443412df781d1f0e7265be77a178e2938753a08b948ef428103b78dd743a56 -size 592182 +oid sha256:59470cd6e403414e4c000bfb16a36973c2b9f744af6def5033b9626498d9a37e +size 605700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
4eada4c349..337152c0c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f48f135682493c5f6dd065a3191fd55138fef7ca72a5da604c92184834898972 -size 508850 +oid sha256:65ad3a7008935c30f2cc1826b60f021c1a75729e1d1be988514664ccbd073fcc +size 526018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ff0ac1ca3a..0d8b7f6519 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73d4b33fe4869379c0f570c4bdeb35c4aa11715249921c53f1f174d6ecb92673 -size 660517 +oid sha256:1177d1ef90371ad7d4c7105c3f91393b37f8bc51ca3aadf6cb556321765593a6 +size 677241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
index 769cac551d..32d6f0edff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20827fcf42378398b18fb617d269f17dda13f72997101cd5b26c897ef97ee29b -size 614042 +oid sha256:5d7addfe015a921b7b135a19b7958b90f3fe7b9ac058f8a568cf84f5767afd71 +size 649613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 78d66842e9..64bacec598 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb7abb70ea711647f3addfd25a60253253424b3946f2d52914c9f13f36365f40 -size 652913 +oid sha256:fd5ef156281b5b4712f73da8a423e22d400d9a8c13c27ef99ae110ef986650f9 +size 669391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 53dc008762..eb5087eafe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ba1baa143e38be446ba8a67b609c1075da2219fa8d1eb77217693123be4a417 -size 618971 +oid sha256:a9a92601faaa4bf7d916eeeaf54a6bfe5318ab08916924a4c6b5256b0fa499c4 +size 642551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index deee2b15a9..0f5657d6a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f59d0d37a18c7ff259f2d06688e0ff73c35d296d17ec4318f2baf1ea97e6ca9 -size 662619 +oid sha256:20b0cfb01466f634ab9db008a450aa1fc71744254e5bca6243eb5bc46a1becba +size 682499 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2405c35489..8699c0345d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5710db6f07261a0241a5ac8e2f4508000e78bd8e00e1c908d16c8ff81450e5cc -size 565028 +oid sha256:b2ff473066015e89aa758e3b35cc3dc5c78f77c63390dca572e284ae918a836f +size 575240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 62b937df73..214c486879 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5647a6d241df3e563714a37a28df19e9e2f8508615d0338cff42f95b1dda771f -size 622015 +oid sha256:261eac13a8c438674d6a101c8d6c748444589d959595dbe149f930f67628621c +size 646977 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 411c46802a..a5f90d025b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3480af5590d98fd7ed24f2dc7eedda893642ff9b0d69ea767aa1f070601d3fb -size 528420 +oid sha256:7f1769c2ef6c8fca88e8b82436f15ec19d5becb869c3e39ed53bd80448c4142a +size 545392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0dd4c9f0a2..5a66b7998b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27671bec402897b1d9f33ee3a6ee8326d3de1e73ca9bd18bbaf924c3ea1beadf -size 653955 +oid sha256:3a50d993c018cbbef5a8ac7c56b47d62d0c62fbba5f607d15f82e3db71a4e194 +size 667571 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea3799d91c..960653bf88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a451cf4a80fe2b284728c9669038dd86bc89fc933d6364687b2fcb9f61538ac1 -size 606888 +oid sha256:0bb900edb64372ab8e23aa89d75ed446fbd2a0781f8c81959e2c56fb98f67890 +size 639943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a3f392b32..c76a43a4e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:914454e591056af60ac7d2b038e60e2a52da6f56ddd3c706bdd25ee04a0924f0 -size 645761 +oid 
sha256:dffc3d872f1526d6abe2baba93daf1bad19a69dc50d80d816fd158ff3e24b5e7 +size 660511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 768d0fa115..63e1f52a37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b3373635b14a3f4679d2b10fef2645c62b18e2e0c4403ceffde7233c8f81b4e -size 611816 +oid sha256:62be89332c41ad57c29a07b14f4af747c61512e1547665b0ee0a189ebf98baef +size 632883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index abfcc526a6..8051018692 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec14339bca35e3b5a39850e5025e6730725d3dbd2cb3facbb7d1d6e0effe6252 -size 
655465 +oid sha256:e923fd43d9e1cd7331a6be0047fac9279fd9788f16a6017774391943ebb8f475 +size 672781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 417615abb0..2b547e269f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffbb6861b6106d59a800e5e2a8a8fb836c4537c7b874132e8a866bec614dffab -size 557874 +oid sha256:ba87cce796d0dac9938f5732c5e62af73faee609f6f2b8814182bcaa6bcc621a +size 566360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 337dc2d90b..1f29ace311 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0da65c4fcc69e153dac89cffabe30523f33aad45a65524b14caecb0bbad00ee9 -size 614860 +oid 
sha256:97dd54633922ce67482169a1cc721a96cd3e6e6c1cf6811445741849feb4a0a4 +size 637259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 495c9dccc8..c0d7121696 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e676d5ffa5ca4355a1ef03af5df465f0e1d9ce5ed4ea5e01a1106870e263133 -size 521268 +oid sha256:349deff3477efc8aaf3f5e61af1460e17df422d9d9e9ac64a3eed9666644c43d +size 536512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d9941ef7a8..4c0c1823d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe028b3a6af8fccf565dce7947e6d09d3799b093fe1057f782c45b6928fb289d 
-size 942659 +oid sha256:5db47bcbabd446b565fda7cddd02b4b7037dd1e3edad4518bd545c213edb5d7b +size 1042855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 33b4439267..60c9bb0788 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b19a442c86219ebe7e80eb878b28947508553c24a4cb2e26ce071b5f929d43a -size 880249 +oid sha256:bcbdd5ef8e5f71245226ad567f8516fdf8aa301ba9c53a214b0232d55de0ede2 +size 935553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a0917ff39e..b977e14ac0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:e97b68638adfd4ceb33a491b3366e59f69914d418a78b19aff2754a43a015043 -size 822483 +oid sha256:4faf78c7b63ed1645358bfc1b7b48aabadabc826fe96b8199c7316c0050cb634 +size 870483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4a3a3ee724..4dd3c2ea16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8a2c6f9954d54d51d5533e6df846c15df46102da68ed76d8921bee7ccef69bd -size 887403 +oid sha256:9dfdb6ab2ee98434963380fc8dd20825aeaa8fefd16d14f6b31c2daf4ecb4b3a +size 991151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0ed40c67d9..50968c9e2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d65cfb44b517409a6d45c01f886655f1a145636cbde220f49e725f25df0b589 -size 829237 +oid sha256:2b2ba37633ea798ddcde9d0e472af1396f85466958f9fdd247e75c24c352cd70 +size 888141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bd47585e1e..4ecc405cb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c4f752ae5e0913448212e20fa410086b335b3acf295e41b3422a8dabb411900 -size 716227 +oid sha256:f162fe155ea9153dc5eab26304930a592d813716dd65ba50255c6ea8e08c041c +size 767187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0453b89d8b..ad174390a0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18cd549300a72c9410e3d95f47e59efeb7e411282587a0e013c6367336a9f897 -size 931849 +oid sha256:b3de654fa08b5994a20097bd020bbe45698c868b331be5ca119fb9dd3b4c6d18 +size 1032685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3e84ef5a17..d3dab98537 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08b51e3f259a990081a444b330cdeedff8a2b7d11b598dc8cd4bdb40594b6c0d -size 870131 +oid sha256:7736fcddb2a8841b47c612b56c857ee9dd264cbb672e59c0f34a770353f5e1c4 +size 925581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a3e741175d..1dce521d48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89f3c25d4f3fc1e987c635df965f67ad8636a9c783220d9de49d708bc78a2ac0 -size 883055 +oid sha256:0f8d0720fef988bd45ee8799992ad39cef0d7e74ec63b629981782d4a4544500 +size 985767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 57e70c3dba..3cee3696bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5904a99fbcf31df8096c1372a32b3081ccf47b2fd7e4413c408c13da56faad2b -size 824741 +oid sha256:96859d5f935bf32a3d94471ceee0c4bdeb686abc158001b2563aaf9a2c66b76a +size 882757 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 568e8943cb..77ef04f8dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d625e5e099820faaa9075a6c5e554642779acef7144b746ea4b5982af1d621f6 -size 938347 +oid sha256:dddc4c197c4aebb7e71e96392963fc3a98486e5dd5cdbdcb0632cf191917498a +size 1032179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1c0354c3f2..a877428523 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae21d77e9a9208d8ca2777fec8fb4fcc9a06da732191c5052d638ce036078626 -size 846085 +oid sha256:000a058e44e1763710395560d093c20b02853ce92b831c339f23055e8d15a4d4 +size 948353 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index f68bc21fd2..5909e0fbcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd805c77605ee9bbda32f360fd1859ca8c47f6827dbeffd42241113dc80e85a8 -size 879489 +oid sha256:a03f20ea92854f7473b36727ceb794936c5ecdb4cc027f21a1ce46081feab721 +size 938147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3b73eda1d6..08ecdf4d25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d1bc51147d2294217adcd16787541a3294d4084b3842815a3c4c85154f82aba -size 782493 +oid sha256:564873167b2c19f05e21cdd1fe28503664445f6a65a6ed8c93b774e13658ce36 +size 838337 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index dccaae2f0f..504bc6e460 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0484e9e6cec79f824ddfb3b7fe5960a6073e999710168cb3ff3abd5ce2dcf6b1 -size 733613 +oid sha256:8f101743a8ebec62b56d207844b425baa33a7572db719165dbb4640c7156765a +size 810869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3cc0cd813d..691062d731 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05cf6dc2b8d418ed59bfae243ba0865acdc46435d1b87aa744ac40b4880db609 -size 680227 +oid sha256:8fc6f99f9fe4ede5bc30d4d292abb7cc4e3d8842279412e6fc069e42b1ff39f6 +size 729955 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0dd209bf69..ddbfa8533d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f923600fa9efe870f6c373d6b05dca34db03e4c2cdb8be085c313a242e99fc1d -size 892809 +oid sha256:374b263e00162d5900b1046ffcbb9ca47a68a0c8d012db058f96f2b139f6141b +size 989453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 719a1a3511..f95e85bc7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5fa84ec1f010417a78bf91daa64db340e87be783454caf2a00a1d7fc08b8d1c -size 801929 +oid sha256:bc36a9899dac3fa4a32a77a01eb7fbde670b5d175f7970aebd1eb7e00163b5bd +size 906221 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index c1ce9a8b98..b94ed034e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13dbb0251150cdd248a0dbc038aa2d7a227f86c952f653971fb59597eaec5fdd -size 837555 +oid sha256:a14f861599ead570bbe2794e97109168e3bbcdfa258d15d8749f3643c9daa4a3 +size 900701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 320cd75b04..ef8407f71c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51cf090f8f68366fe592ebe85d19890954a65a4489ae9b7d4dd40b3a91add4b2 -size 743615 +oid sha256:c5cf9568ecbdd6fb51c2ea9b4e5da88b90346530ba61df71be2eb58f2eeb4a97 +size 801435 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c69e588cc..45dbf8df3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a53e39f4edb4e8216ab53e24480650c3d38b24667f02e0285b65d4bffbe095d -size 896779 +oid sha256:bcf13406ad96bdee7e61a29cfc13980b00402f020fa3705f1d7fb1f010ef0c98 +size 967079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a13d8af37d..de5fd89198 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0352663c468e1d9a8bedee73a043ab97127b3960f066893dc0ec6705f417d781 -size 858839 +oid 
sha256:982155bdacf104a28e1f338192194a4e063f5e493e8b238732ac15cd3a1621a2 +size 903485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index aff9361ba9..8461970a6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed0b1cfcd2f78706036633436ca7988338db572570be87fb4e5a4f7837db00c1 -size 763923 +oid sha256:0530357b88aee629deb56a8a62c4c7f1f7f523080906a5b9710647ed4cdd2630 +size 795349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 677fc169fe..7cdfcb607e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2fc75ac4286faaccbf977da09dbaa83531d0a2094986a6ffdf792f8fbabf71d3 -size 841573 +oid sha256:fe3f076203432fc19f2a1ed26279e41ffaba9b0ef3b432cc1b02eb0bde024cc5 +size 917003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 626e9864a3..d7dfa0a0c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3fdb9672faf29aefa6ec0038af9d3da862b2f49d90895c080a2e5c6e9796305 -size 807827 +oid sha256:554a3d8fb74f41bd05d557973c8d35ec119e8a264fd12b61b256bb4ba84a2f77 +size 856863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 216864fc2a..96c7340f23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03496f7bb0337bf8a593a4bb2c2c1efd36a06dd35efda6dd85b4b9d9cf138b93 -size 658507 +oid sha256:a13d66f13609902c4934d69dc6674b0a749b9a452d2e1e34712fa1e389d9a7a5 +size 692053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1145df509d..d2c6fcd90b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb04746cf5f5fd78c103f015b0aaa76f430da26a19baa1ed5c2b46eb07ecf32 -size 886757 +oid sha256:f5dd2b937b9463c989cc141d6d8ebc40b6b9b307ab23eb2c1173561b66039bdc +size 957699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 97ed359ff3..89cb55310b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b19b1c8152e6d4d1780036defb657859504843dac0dc86e14d9758d9c47f148f -size 847931 +oid sha256:e83d3c009b01fa2de03cf53f4f48ba3f6902283a6ef50dfc824f291d20573809 +size 894255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5faaf92596..921d09964f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:064ccd24b5ad5f35582d5a1abd5518744decaa2d572344aae8a225cbd6c0e278 -size 837175 +oid sha256:2929ee098bb3a4ffcff789e463f8a72f9905ff7345984a1d0469fb1ae2efef14 +size 910831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 89538bc601..9f88f0202b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9458cc6e096af4e7e2d7182e2f46f41c171a2fa0317c80e042d8303db123699f -size 802541 +oid sha256:faeba33f0285a5a5cd79ec83b576f598f3279d1082780dc71f8c3f5e1c79b8f5 +size 850691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b2320a14f4..dcaa6dacd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8571c536336c45c6de2ef334f3e30576b988911e430bf068a8d7217765a1abc8 -size 892269 +oid sha256:5243433c1348462b4a1dab906c0db5462e3ad3c4568ccd102c4a9a921b145b21 +size 954775 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 337d9a59eb..342c87b37a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:721efc372533fd2b6278a093890b002e6962d56abe3eac1bd159570b02acb139 -size 800205 +oid sha256:474f90db518355471fc4ebe0c408cfba9376f417180db93bb9801341a4bbb316 +size 872627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 7e4d18539c..ea339d2fa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57a55fdcc987b0716081b8931e978af5c1d4b1e5f23773804656de53e52db93d -size 859017 +oid sha256:b34ac036e8004df2586df9d897c52854fd745428010dabcbfa3a5e11a4874aef +size 906425 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a36f987878..24c0fe4f32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:033519dd713aef0d1756f65aa5fcb54d51a83a8ae8f902303d1a9634ab77dcee -size 761871 +oid sha256:de35b0b4bb1375ee7d76fd7191baeb60ca036cee46c4e702beec3c6953b275ae +size 807061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0a41c2c534..2756cd8a2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c7d38e9e6178b26f038f4d739ae1f9f8ee7cbf5038cea339e7166fd365726bd -size 686055 +oid sha256:f39a8a01259acfa656516c689174e6000be9e7de80cbe49de93f91ebcc14088a +size 725177 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a41328cea2..701fed3d9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6c662c40574bdf73442da4c33bd256146b73a123f615f2ea44fb22e1ee8d3e4 -size 623345 +oid sha256:ca7e6f972c5f8b517e238c2b081c0978aaa6e60518e3c1d16b9444538d9d4609 +size 654819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2008453ab7..c3313dc0af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61c3cefecae5d478e7817aad00a9a8c6963b08e8a44fa7c34f7f6749b9a2b56d -size 846733 +oid sha256:e21f112834efcb13c0a35f2c68a10acf3509f2976c268e047012749d909e0017 +size 913777 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce6338aa85..92016c6bca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55ce71451f6f099bef45c7004a97a4e19d77ee4d4e0593fc1688b04e08cc1e18 -size 756839 +oid sha256:aa72e478290c8adf52dda652b0b946e97661f9992fdb825733c7918b7d50e11f +size 830493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 62a74fd673..0b6ca81b77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbbfb63024cf2d3c2f9b38bbebdcfc5aff0d3e55725d078578c69fdc350ba579 -size 817081 +oid sha256:ec9cb84e8ce622e166ac333cb95b1efb3ae23e32aa363eccb4b445ab7b43f2e7 +size 869177 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index b062f19219..10aec5d5d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf17818b5adb8606ba4ed36d2d9aa9fe8f531b3fb014880f4414c8501e6480f9 -size 722995 +oid sha256:21e06839bf6a89babbbbec5bde5010682c2bf01a84aa80fa0f44d86307cc7aa9 +size 770207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 97da365c8f..d31b6f3d66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8b1503a725fe88e97e9c3b70d2bc909582f13ccf58706dc2060e185a0519100 -size 863675 +oid sha256:e391905948485319bb22ab8a4ef26e1b3190493d3811f6eb659544a5b9cc2e10 
+size 905905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6d9be1789e..54e9cac6de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43e6bfdecc33f99a61855e0813eb24ec56e8e17a2aed3076f833bdc6e6959dd3 -size 820161 +oid sha256:0c62868bedc567c3dfbfbc8dcd237a6e3d67d26aab7a9e6503eb1707b8bf057c +size 867817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 07cc068514..9397f213d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5593d2b79f41463624c6b88dbd127a4fa84eaf6438d03f6822fcfb0b86c89b30 -size 851681 +oid 
sha256:665602e673aff456bbaab7cbd97eccc16226e221afd581fb1e8f2ab5933b6c39 +size 895737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index adf3720c59..0e64b0687d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c0dd697c56271e530ffa6e422b16f9ea0a55661902f8076b79de3ed241f0952 -size 817393 +oid sha256:be871e1e70cf7ec4b81e620cb9a722c254de814530a1bf7e79ed9dad752e9b62 +size 862583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4164fd5bcd..27c117a54f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d4680ad5aed58e28ccbb603a44de0d02c0ccf3d13b202cd82a5458437c53f228 -size 874805 +oid sha256:679bd90aff1a2ee110391975038d960f7a9ada7e851f11a40a3c6a7285faba4e +size 904355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5c00942031..0c115d6397 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f80efefd06d7f381f0d5da2d6d697362f979dec203436e25b2ef02855a321be -size 773515 +oid sha256:772e6285eb8b09df17d9dc8373f9acb046e51c6451e9f4ff7d345895545d271f +size 813229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 55b247a5d7..a2f4d3fbcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0f55374229b18ea317dad845496736323191e7475d08d0b2683b6cd54ad06e46 -size 840121 +oid sha256:fca2356100532d48b2327b411aaada99b32def00dec2ed991e41789d4eccc9ab +size 872731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3fcdfb3335..29c43b5065 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4e3ad59af5543b925da3724f9b8ee6880a4b1dc12b024fea8b6b72d760bf5f0 -size 742631 +oid sha256:4f85b0288c1ea48ff30caf38510986f4ca24ab5421998878a41ce65a83348533 +size 786933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 54eb7ebe89..76d7411aff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:228e9de3cefd7f20beb642d3c64f4cd258a8c94cf8befb6da31291d60eefa28c -size 1018187 +oid sha256:4ddbdc7bc8f94bc35e116e3a4d5d0fdbce0053317be210a0fea9a9d14a12ade0 +size 1144975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3166b667ac..29368ce3c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57210a4c8694b9aa0439e4f09ea4760c1e4d845238bfb8f0748e5ac71518b50d -size 936341 +oid sha256:4d3080aa7824d5dfa59316274bec64b311cb1c6b0d6884a49553d454c644686a +size 1000031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index da5539d1f9..5efa633f2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:688dc964973c7a22253c2db279c4026688f770cc24fe907c46762e81417e7525 -size 939903 +oid sha256:8b99ad67531d24754a8c977d289f0da9037e57de6e6e986cce91a8f7369467f4 +size 995009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fb50362e84..693b998bd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ff98f8384fcfd38bea48b87574e6f8510d827d629b373c609702ea3a82315a3 -size 903339 +oid sha256:4a479356aba4179610d5ab7a8a9d45c82af7f82fcd22a60e1f9afb48b1003346 +size 961059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d195b179ec..4ad5907ffb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02f0e5a45c2e49e345569d258412ab644cb9cd27eca36a836f8a56b61353d236 -size 967569 +oid sha256:10ae58e13ceadf368edae99681aea5f35d0081761408217bd1e90af72ea9765a +size 1099981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 22f16bd64e..fc4104e20d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e247e2511b49350cd1afb6c80b1dd7666cb977ac41f57d571f11fe7425221194 -size 889127 +oid sha256:70ca51dc1138dc02aa2e0836cd207b792b4da2de0ae8fe7f5d42792c6bfef61d +size 958343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 0f2ad13da8..b7a536b5e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:494a456ad4b188ae65f62dbf181755cfe7216f7988af6c678b96647383915805 -size 800693 +oid sha256:3acd72cf9852e07f02c483773fc53f7f59b05dc03a9cc7f111c5d291eecda4ed +size 854811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 521500d141..71293d8127 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92d3e36904912880257ee597f4a3963dc7cc1f3cd2da355e789c3373da1ea155 -size 783567 +oid sha256:5ce995c6932e5b6a3b62146e642e8e8794b94beb92c477c8b07c44d738816164 +size 843605 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1e2c970b5..e9e4582dff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0914f1725efcb85979fb8a9b3aa23fac07466e4ee7492a631c1b222ed50ff21f -size 1002395 +oid sha256:2b16925bdcb34bbaab65a6994378420f7ad09e7b42280bf4c2f9566d235486ae +size 1131599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 992de7e087..8dfb02fbcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c1a3e83378fccaec0b141469538a651ef2cffefa0b6176494bf24961fd55a1d -size 920549 +oid 
sha256:bed6556574ed20c3cbf25abf11533c7db26485fe559a0f0b3f8d3d1d58f99aab +size 985867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f42bb205f..5c5be45abb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84d5b86b0a8f2aa534cbabd150bb265a8fdb35a7fd258fdb214c9cd10793c769 -size 959177 +oid sha256:26c5e3dfbc6f1ab21940b43aed34aeca7143e9b28749d5d48fdd5a45ba4dcd36 +size 1090305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 13288f268d..ded288b395 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d553923fa36e4dba4e58281cc9f09fad697a76dfcd337ad320eff6f906d7fb20 -size 880735 +oid sha256:ca3183c60b9ec5a7fbf0ba320c5d0f228392bb1ac9bab9eb42b17171b5e62dcc +size 948667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a30002992c..27b287bdf6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1feb489fb1f060a733c7fca3f9d679cd090bbd6310c060f4bde23bc9de62a121 -size 1017231 +oid sha256:dc9c7b3fc28b50eab2fe3d0916360c009d67d8939ff72ebfd25748ece67be5cd +size 1127539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index edc543d22e..968488a086 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:79be89fbf3a33d11f661697c6512242ff73fde6ab5275a62ae752f9b866f7ab0 -size 916533 +oid sha256:5031b9181085faee3804193404764b857cd773b54d8c1cde50f94a1d647bcce7 +size 1044701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 461c902864..dbd43367c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a8d0d549c1019fbaa77864884bbc371de396b87e16e62e0d1e308e3c5370718 -size 935137 +oid sha256:fb7ee49e61fe17d7c712927fbdeac492dc2591bee05116ad4c1a9c9d51dcf133 +size 999369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 24d373220b..2d3657b370 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d41301b0fe4f266b5634ab1be1e269466ddcf687333356fdb764df185ade52cd -size 834441 +oid sha256:0f9d0a385a245a414d014760d3913d620ac6452a64add2c27b37ccdde6fa6a7a +size 898573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 2f62e76550..77c10df386 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52e6e75881ea8010d363b36ffed4709b3f0c9e1ce694e7020058eee124aa4767 -size 856215 +oid sha256:67f5eae486525b8da90c126f336ceb4f3712d82a9b08ff97f9637998a887885b +size 914279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index a268957795..b1b6221669 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6c2b03bcf69bc882e11d940d3aaac0dae2699ebd6d9c1173c1a02854058132ce -size 805047 +oid sha256:4397188609eb64d0eea2b416a23ebc8cbfb9d278d3fba81f2c3aec275dfef3af +size 893355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 9490ac2de6..e237313a60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a477bfa2eea9b69580bb8a48f241ff7d3dd8798914d3974ab50a3cf0d6ea937 -size 764249 +oid sha256:6b29fa2ee9f3e5b8fd71642caf5c0ef5ee677db0946ad8654d2c122f24214ecd +size 815901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 126b4616c1..c93ac2de30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3cc0f4fb0d151f5ed770677ab3f5658f807deb3f5cdc4ee94e44fbb71ca1dd49 -size 747517 +oid sha256:330b6fee1364fa82280a5be6df7761b891e33f0a8916472b7cb727854b85d056 +size 805879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4cf34e99ce..1de69cfdd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed3ed71864c41fc45fcde6d787db6678ce5ff09eb956749a66f381e7b1e7865e -size 964393 +oid sha256:efcd5af6831fc57fcd4f2fb578cbd8dd6dfefb2b5d5d90d3cc0f87a72f5f3eb7 +size 1078105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7c609d8319..0141a17ba4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:5b0b1430915a7c2398bb05e2ddadb4ec36ec9847f783ca9ebb692f6272a9129c -size 870503 +oid sha256:0aaec4f7da56e70e6237f056c6e8e624ca37eb9ad9972942407d6e200b28a35e +size 999361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index c04d40cdc4..8a2ebbc200 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e00601ebc79c53f6573bcbc3a708344acd2676f171e8f06d5feea3503d66f7a0 -size 885901 +oid sha256:e0a201feeede6a90bb11115d38c36d8c44ed69ef4330c0a09d3f5f8130a1d48f +size 955559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e794ff57ba..e1aad55867 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:dc6e2408b09d0d5f362534e189f69a6e6e669950ca19b5ce0e3ead8af467314e -size 792851 +oid sha256:00241f8e1d18a622dc132c8a3646443fedee6e86168c18fcfc9e44c9c1cdda95 +size 857575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9da6fc948e..c92f1d050b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfddef269dc67c869f9c6f3b57008905e9e4a7d9f60f98ba2fdb6de476fa700c -size 963279 +oid sha256:899530e87f5fdc29c46169e41aa34c7395f3f55c174e19fc19b5eace118ecf78 +size 1050353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ba159021f9..9493987ad2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6759ebb918cc05bbe306136a8ccadf7b814166c9e078ebec954910c539b9952b -size 910935 +oid sha256:549739c7567d3ff1c7921ca9aa602f68ee26187758870bc44e229befacdc5046 +size 963869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index afa96e0038..c9b8371147 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d256e13f75c31c72bd398bde92bb8f44eb8e33d8bb45e9109b200e97d62831b0 -size 908183 +oid sha256:6a6520d2c617ef85bdb7f179fdf35da3bde041d9f505cbc602b0fd977e1cc4c0 +size 946663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 65e9aa10da..46d535967d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d1beae24225b0bb0a8738c5ccfd8b9dd8341ddd9edc0f0088a54bed0e124e87 -size 832053 +oid sha256:c869e23347e984a86a6eebe6bc9522771649b6812d43ba1e81175045e2083e31 +size 868313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfc4952889..75433f961a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6609667c3f905c6794c9898f1321636b0685c660bbd86e4ad4d2a8a1c620e357 -size 912661 +oid sha256:adceea641eeb2ead3f8a0794cab9d3c251858157d5fbd0f1483fca4943d2a068 +size 1005359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e547b9ba28..7eb8751e36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8170c2fb7fcaf3ed8520cfd88711d08c08bc1ac546c849f5069938c9e0945a82 -size 863721 +oid sha256:45a69e01311388f93ff60c42f9b0bab5e60dffbf326c106347703edd6a7d75f1 +size 921441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 501810cf57..394debf885 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aadabe24f044d550e776781b20a82295480306de8d016b99efb053a6161f3ad9 -size 768971 +oid sha256:7dc3eed17bdc3b747403da3baabd72d94bf2247536ce121b78a2557604092099 +size 806415 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fc6aee2d74..6a05b34482 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a76600b4b37400d909240283c48250e048b63c752f18665f9a37fb66d89f5e04 -size 712279 +oid sha256:505a28f64d91017eca329516c1d93cdc964b0214294d5ff0dd177dc646303fbd +size 750859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b90b047abe..1a274a6988 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:936a376283e1bb2607177cc8abc6a7580393ee734f0e6bc6a8c7af0264f2b256 -size 947487 +oid 
sha256:a7b1ff0c3f31a955142e9d99504eeecb507dcadd8134684d0fdf1a0adb04393a +size 1035399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 0a52f0d54b..ddc4c540cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63aa2ec648e71804a514b2403529b8fecbdee6bd00b3a55a3ae676587ed636a7 -size 895191 +oid sha256:18293e5362cd80169efcfc19d4dc5a5dd5675b1d960cf2a8bc6463deb20e4d33 +size 948915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2ea5a11811..7aa0ba7387 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f7f5f6a1b4b2b0a6fedc3ec211176a8f2b9ccbef365e51b8ae2779e1f422ebb7 -size 904319 +oid sha256:709a1c3e20deea7f6838077a2fe09fd851fcf26fa2a8b3307f2180c28a8b0499 +size 994895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index b03674118c..7b009e50bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24e6cc8b2f3c61fd70015ea83d71341b0cd03ca912211c80d72aca3ea1476e05 -size 855377 +oid sha256:67368f84d4dfeaf78e712f95239cc121b58be72809975c81e35aa2268b3a8e0f +size 912555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5f0ea38016..8ed2cf486e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3777101a0acc9a42ab08ef8c6297e021caa1169944dc255aa8a87472b16f1c91 -size 958227 +oid sha256:d557a086de1780390c53ccf027dc7b985cd48334ed9296a7a34529e7c582736a +size 1033313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1daf61600..d0b74d4617 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64a0a5778134aea382d8ae33ef15399e4f7a015b512df84a340005fdbfe37ff3 -size 861675 +oid sha256:c006ebd94d7c3a10daf5f1b1e03c032e4ea7776db3fb991d893b0273573c3704 +size 951115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 997cedba31..edb731f5df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:527229665beddf886038a5db6cf8d054238133e4b48074e3f5d0d2c75e3271f4 -size 910669 +oid sha256:f801a8edea52d7180ccd6897f5a714fb0104757e704eeb33fa3cbf8465c2d9fd +size 962863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 0c55488f40..595948863b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:183e2609d39a1520124bc2b7640cd3c12e04cbb68ed96b3b9017cd85241c2ee0 -size 809033 +oid sha256:bae85666693751c5e11b0b4e0395088b7b1290fbfb1c27582e9c5d23bd1815d4 +size 862413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 8c21278840..4cbc464c62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f5a094734070c27d2c059cf7e53240f426c11bb0bac534bab899f23c1bfe513b -size 825331 +oid sha256:e39c402122fcbbac759ee115c2fda71c29a1291b877ecc6007d69ba9cc7aa6ef +size 863121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 1ef91fe3cf..1e984ae2d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff9df5872ed4d35df96cf7b7039a7b10ffdb488827e27557f4f8d88b52b623e7 -size 748463 +oid sha256:fbaa1cbc8b68f7fa6cbf9402dfcb331bf229a165ef39d9e3797339aa83938ffb +size 793651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 9e4d263e12..78c69be3b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:a482242b9841133415ff053b022a7326fb8e940d8a1be61a0762e91b3d674ad8 -size 732527 +oid sha256:6c1a2bbce9d40377fa0a0ac9ded3a309f79fd8e7c2925d50e1dd342e3ff4a9ab +size 767505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a4bb5441f6..bbd72f11de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c8a9f4d96dff0355d743ecd6c2ef5a7fe267e0e124f69c9c329246579e8768f -size 677859 +oid sha256:84e5d5d16137a04569f458ac085b702015009a2059d7fecad73499bc94eb9d4d +size 712293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dd07d7c436..0d9bfbfca1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:15791f313906c39fbf507df1a7153a266bbebdb60ab32c45f94312c3b657a54b -size 905389 +oid sha256:42d7794c6d1e30187016fafad1726b9a005a225d90d3067c2850fe8b01f8b88c +size 984521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5be29953b0..844f01146d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a1f753ab9a4fca5e1f1ebc3fc1f990749805f2b9335435389e12b19f4c01722 -size 815595 +oid sha256:7f0abbd35305df6aba7f858203439a28d997cdc22224173a8a79c1839f274145 +size 905777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 14f73e71bd..9656671a7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3c719cd499bfb24012fd640707e296153af5c0d53c28626111638b8eee1f893f -size 861333 +oid sha256:3b943c2d7f8d01b3b84715b42e101e863cc24db35fd207ab0361be5973acfd16 +size 918955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 224720ee32..d32e2a2b08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e82997c27f8b632746f551957e0c100cc425cfdd6d2c8611afd1ff42734fdc0 -size 767443 +oid sha256:d1dc887db6c6087c2010aad89cb19cf53a485210b4fdecb2afa52f9a3f13b5de +size 821463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7f20938263..292ed4f970 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ddc89cac30cb4a9c2dff62d973679d30a1edbb745611ee11cbe482ef30105e3 -size 947147 +oid sha256:de2bc7045576cf3588cbaf89f51bf567c9cee8943beaa0f515c5ae0549f99f29 +size 1011183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 755c8c24f7..972920d2b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:542f805f9633069d92f37b626ba6931af8e02078f1382f7aed8a09e7a7720b5e -size 1070439 +oid sha256:b0a06da28d613e49b437bdaa2e75db8181c1b57886160c23b9fa3589d536c4fe +size 1077691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ce87613d8..76d6ca28ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06f9df5c9daa342de7c76abe9a5c6bd377073dab0b65234d7baa50cd4350d5e0 -size 912415 +oid sha256:e886ec93edcfd4cd1a4b187c87c8ea53e1e4b27648077bf43107ac27049cf74b +size 978521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 37d5429ae5..3a53c03e6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:633e33b0226847dc791d166b2dcfc1b560af55ffd65b2654195ff9c230b554e1 -size 934879 +oid sha256:2978edc4b4c8d68f7ce39fa7245affbfdcf6653161624123ff5a70b46253895d +size 934387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3909546302..265b6b2a3a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abb14082df283757906c7020399adf5d0d4c0853f21d602c9c443cca334fe936 -size 932095 +oid sha256:33e8de5ef90b796aed876ddfe62d0db6485b7cc4368a6f578f290faa45ef0c51 +size 997561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d4636f10fb..c8e81eb4f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d24d955843b3480d2d7e70e687bc9204a2cbf34788b1a83462df1181a30c4b0f -size 902295 +oid sha256:fa384483252a26a45a0e02e7c87da5988193991aa4902c8f021671104ed6ac4b +size 970967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7612f0983b..4f006dd900 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdb442ce39e2dcc465c3865b024773c30a4b597c70eb3fa0ce6bb92e2fc30d41 -size 946239 +oid sha256:4ca4e4d5d75eda5fdd713695be655426d482157987ef352798b8cc8e793b0081 +size 993551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 003654caa3..0ad1d3d5c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:446076017f702e218dd7d5b5f2934c7df9f8a96fb1cda306a72bd1452b845b45 -size 854619 +oid sha256:c5360dd3d3d893dca6edaea5490d51a1f2f550ba5fe12a3bb75d4685a220a0ba +size 912191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index bfe1e793f0..3429df83f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:619b978973d3e1cc645aadec482e92d24be0b208353b9ba4f54b7e44df66263f -size 1093557 +oid sha256:7545e84e245c3d03bb180279fe3f8f3baf4126f7ec60522bd2836b5da1c60d10 +size 1064303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 6afef9e147..97d01d974f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4618cc75ce2ebfaaf5a1e50dc7b1494b08ecf7e0fb000dd327c8e680e34d678 -size 893947 +oid sha256:1d54cc9a27c35b993b095ce1200d129a75099e2e74af094d3a146383a8d4f134 +size 894785 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0c4d389cf4..d58aa53aa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fb679350e8974fef419509986303f7dd8a5e87d52e51f08d899d632b58caf58 -size 910421 +oid sha256:4a2bf52ad38e21408840f91d5e7fe6f3ebbecc37b26e3fe610b642b8cc73547e +size 961629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6d443fdb27..820ac75c2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e3daec58851281bdedc018cdea275abeb25dc6f98fb5f42a3c89d3406703648 -size 822009 +oid sha256:8b50c34b4836db90556bdf7ffad0c85225d967c9a4dd87f6287019fbe86d40f4 +size 882393 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 03cbb03361..a5ec883f38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c0a4a7efea99d4fa55b19978499c61c5e656c2c14d185c47c9de5eaaaa560ca -size 1164955 +oid sha256:18b21088cc1b2a5c89c796040840b6927ba08f497836509e3c65c11cc5583e3f +size 1327113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ab2caf744d..380461294d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36bf32e22078186b661dbfb169b4724795a844fd19c8bde55cfa205ba1db5677 -size 1055333 +oid 
sha256:74468b06c54b381315277c5e749920cfd04f4a8eede34edd3abfd89f5e1ef3a9 +size 1125633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d85660626b..cda42272bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91a3a4460949e19b84e7abea41d2c89ce0caec4afa27d9c281b2789b80ffb258 -size 1059579 +oid sha256:7fc99c50ecc97a4ab13743d00eb18419c807caf3ba8afd56b3263f01272d83e5 +size 1127411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1b1e6370ab..c214a67bb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2f2978ab9e232d98a17c3a49654aad163e6badc3bfcf9feac41a4d9fba8b0f75 -size 1095935 +oid sha256:b687190d3c2675fda9ad933723aae691b2092c23dbf230ccbab524b74f4fcca5 +size 1270773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 848e1d1e79..c4f1922ce5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3a31f9425315990e96e1d1ceec86b524479154ff8b702d34279de2447deabf8 -size 989127 +oid sha256:1d9a073e2e2cfb44cfb955436605913855ad6ae23d447dd06c0b06f8d2358939 +size 1071809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 76c021ee6c..e418086978 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65ce4a9e41b130f85c6a16b809e20ef46931fcdda78895b960a067269f767250 -size 910995 +oid sha256:bdff4b1fe241e080a886889630e9a3c737202739f8d5cb8568b3ddd9417b5b0d +size 984599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4d42b73997..d4726dad7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:832ed446dc943f14ad185b6b13a8337cd94fb2ba07b2d9d8f3ec4e9ae30948d8 -size 1141811 +oid sha256:b5d39ffb9cd0c669923dc7b3460f3010475f3d91de297b4f6ab720274e6888d9 +size 1305105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index da9f97363c..52dfee68ed 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94c0eb239fc7bc7c6df8705246200c525e99989f7d24da3d3f1c27342753e13e -size 1031401 +oid sha256:a4c3f3e09cdaa282fed3f9666f5a1c35de1c1d4eaa84d21de54e271dd29f600e +size 1102835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3012b26417..c8d37efca3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:158c8158b59983d9d20d87ea0b2562aea78bc64c709f2bf74f4c19e93cbf8feb -size 1082363 +oid sha256:9b029e17a748ed9e9a7885b085640d0b5378bd9030953f12220ad7a718e37b88 +size 1256213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 59489f70c2..b28ac7402c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a0b05e48c8a766bb868cc525cff04c83e3e029be61fc992e947b84db1573a2a -size 975553 +oid sha256:b75a09c54e5f67e95428f99742cc8d455c64d322d2c0f997162159d5012f54d2 +size 1058039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9ee7955a1b..19a14b62dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71f31f421f2f02af7cdd139ebabc01f75725ab041d58e6114bf3815eb9736f1e -size 1156795 +oid sha256:0faacceb0fafa334a31ecd7ab4a90697e20449efd39a921d39ccc1eff25428e9 +size 1301095 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a3d90377b..952700306d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af7809ae10f64a51653bf44f549d3ee4c98eb1d5de09c086d9e599f5cbe65f7a -size 1063103 +oid sha256:c6a2c12368e5cc1cede94d6455a36087d28b91d2796a847b62bc9087eac9f5c7 +size 1220525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ed31df1bb9..e6b498e1a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c362b2943bed072c5530c1aa7b2c5c494c45df49d6719c1c949ffd54eb36c66f -size 1047469 +oid sha256:ee4ba13764006ac85d6c2ee1e0d1337598d1af6fdd390ed8ff6930ba342c2ff1 +size 1126403 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index eb47eda1ad..44a5f2d771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d34361ccb8a9af7239d9f90be3f38b311964bd7374b9be746cb80217b3cd3e9e -size 952495 +oid sha256:4222f96e84b9afc3bf184be7ac54c4b4094fa4a6fa55c9672dc5e33ff17c030d +size 1016677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0ba6638172..20e648777d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb98c6afec651ab13fffddc51d5853595d3fc928df1fed30e8508024d9b975fb -size 901445 +oid sha256:5b94acbfe004573371b63993079017a18c7163e0166a76f6d2606eaebadf81a7 +size 1024335 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6fd0d2f7dd..9abd800947 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20af84ed5260b9ebfb5714a3d89108c0b4b40f6454e254592f50d5ab07187fdf -size 877363 +oid sha256:2ac920768707048a9418796c77753cf70551784ec3272e5d1975af8362822e93 +size 947367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 577230cf9b..54f1bb0266 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d3be28531f60ec66aeeea15b157dbc2409cc6fc6fed13eea73ce6092a29ecf -size 1084125 +oid sha256:ec947373204c0438ec024c4b6a0b0f46f3c0450872895cd2ff9fe04782de11f2 +size 1234937 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b9503ce0e..6203097a13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30db844f4b9d82a0ef1c841c1054a1e602948a4ff17a4190632ba5c5d45ed19a -size 994773 +oid sha256:a7fafb1eabe8c15317d8b47c51b21a7dff64dba06397f8a8639f5ebdac8620dc +size 1168821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 041e7f9615..7e391b8416 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b57efee794694e3acbe53499a4cd978e42f2c08a5ce5e5e39d18148ec495288 -size 980769 +oid sha256:094c3344fc75df1acfd23f73a7a5458347f977ec61b723ca8d0bbbf42b84e489 +size 1067547 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a53d14a521..68e08a6742 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39c7e4041e6042bb907d66f83102762216c7858e51f2ad8a8d4386dbd21f19c7 -size 888655 +oid sha256:906c5c4aee31989c5cf1108aa618a7ee1525256a4cb14270a13f345324e9cc1f +size 968527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 617797960e..690d113325 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05c708667d8a8969e3b741acd0d31306bf00436dafc6a710ac7e02c6ca5c1641 -size 1104619 +oid sha256:7079714b4361b135947d8441a5a14c48920155843c471ef7e741b1760eb60c5f 
+size 1213843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 35f655f656..a40b8dc23c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16552fc79c7bb7aa50716c3b77b2a746bf27d1479671aebbbabf3402b8458621 -size 1023613 +oid sha256:28ab8f60e52f68771786061bf11ad9c00b3f662caa94e718593972c55eda67b2 +size 1077337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dae02cf667..392839f2b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60bdc9194343694ef81ba891f6f68e41b01c6569df0e1a470fa60c9b198fa85a -size 
968361 +oid sha256:76b31b1fc6933367f01e236f54bf76f24dfabee70326d4fd8a7531de539e32b5 +size 1013353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 518b2e1264..93ebc4fd3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d6999921c27204e715ce799f2214d593c3884859ff78e80d1a99b29482c4712 -size 1034761 +oid sha256:03b68b3788fb2aafce16812467f0bd8dc00e41d753b01d500892b4055e144d0d +size 1157503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e438db315d..4a2542fb07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ae3564f663adba359fe0abcacc8b67baae55c236bc88672f30ce2743d209fdb0 -size 956567 +oid sha256:6f4d61222433f6a54c51652aff4de00552b2e5175018e11627fada880573ed81 +size 1024301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4772bb9b6a..00c0fd6faf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09da45789689770ef9cee319f05b059e66dd1129250081c1004d5e8e2ab0d529 -size 818987 +oid sha256:1a0c1479857dba980a524a968bc070849ee4b48b6ba8a00d69698332e9fca4b8 +size 869751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b997d0022..33a424e5e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20ffa77ae90c5cd4c83ee5dd406a38ea918317a13aeba39bdc6ec21fafa63993 -size 1080687 +oid sha256:6f146d04f835633dde03be8d82d9a7063103e9ed1ccb3e5b5d2b33b669c02901 +size 1190257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 16497efaf5..8cd06df755 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c66447eedfc84f46dc1d5e611b2bc7ee7da278793ffe55f4144eda65a0c3b094 -size 998891 +oid sha256:ec132fe40d7661f9775d34facdcf67682734ba39a5a316148a4617255f91cf19 +size 1054539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e618176be8..78ee21350b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed40c6e30519250a876267e61f52f149b1f8d6e92504c246fc1edfc6e84abf14 -size 1021239 +oid sha256:7f36c3fa94cdbedd7c67c6f075551846c5a973dba67c0d8239ba09b53a792ff8 +size 1142155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index faa32bad81..780e86a80e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a81c307b1c6684bca6c22c3a9d0338d63e5a507d0a7ddb728f5c7c1dc006df5c -size 943043 +oid sha256:8c37a86597c025afaf6fe9c78e4305fc133cf22827b8c1b11731693b67fbf7f4 +size 1010581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index 2740e84865..4c2f3f98e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a83410e00446ba05aed962dd418bc1a77c172c74fe50d7a9ccc4f47aeca26a73 -size 1094339 +oid sha256:c93754f987f0be8ac45711b6e95d684dfc9a766c9f7bfcb0845e03e1b3bd4fd0 +size 1188171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1fee83b73..9b51108334 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e73a0fd9805e6c71359b6b31b51ec6d4b7fedc737c56401a7902d3d1b35c624e -size 1002767 +oid sha256:f07bc31b644d8a1ad518f38085850b77ab793be14ebccebbe62c629b98b86b77 +size 1107255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 
e3d80af31a..c0bf026b50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b09bee4612d11e6d2495212c1cf5ecd869f0b72b348d4f4700eb7eff60c0de7a -size 1021175 +oid sha256:f6ef05a31ed23613a295a0601eb4a16aaf4f84f27d50cf36b5c8917eaf02b5da +size 1077711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d3a4b965d9..1e02981ff3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0dc92f0018e828fc6f45d95bda473e44463517b515b86d71ff7c61a3baae6daf -size 920773 +oid sha256:eb720abbf2eaf429d11b262c52ac61e963440e118c1809d2873d1823245a0250 +size 969269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 
db7ca322c8..634dafd84d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b954a2a3362bd0870779e1bf38ab17fe1c5e8c6c26998f6b9fcb1b7d5924a3e7 -size 840815 +oid sha256:e63ba0efb7106705592761c11b25d651e1ac9246a92d7689ecead83b75840fde +size 903221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c61dd8f319..9e9776e2ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ef545431b3333b884d1810ade760d3e4f3b808aa16a54047c9556e33c80a0b3 -size 784567 +oid sha256:2eb785d4b9885020e6b304d46ffab09ba5d940b1a7bde5a7c6f2caa7f07e7fa4 +size 832519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 
78e706daf7..213e0de26d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de5363a753cc3b10a51ff70523669157d8101bd93e7b3d6d8baf40aa26395051 -size 1023099 +oid sha256:4aa60b0ad9822c48e277c75e338291c3d8749188995a0ee219ad1d30f2be5452 +size 1124973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4957f9260d..7496105d37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bd0064f3bc6f007802de11050236f8743020729f1b1fb650793c05543776b32 -size 934439 +oid sha256:5ba1ce29c7103f06104eaa49fe0ab65857304071de917ffcad0454d82e756b47 +size 1055553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 
78cab51b27..9c9004fba6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ea49fba8ff0516f61eefb43b0b789a19eef5e6f26eccb4f15c4b1f8bbbc3165 -size 952057 +oid sha256:c2281f17f92ae49ae979788526e33d4492107329f721b306d7e615a72fb79624 +size 1018903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f21eb52d2e..1f76a789b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a9a7811e06541cda1396f9c39d7365a1a1c213e9a0a00b281e17475fa4ac35f -size 856935 +oid sha256:36751c93f6a3c6f2b04e70df82a0ae7eabceb8e8b01aa9318021cd9208d0da2f +size 921067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
da2c6461d2..963f26ba75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5befc32fc7467d63ded0f74d805506e888ad94c2fb0d3a858dbfb6d20541f25d -size 1080841 +oid sha256:4c9dec1b7c96a089d82a434bd3799a59c759cf9d4b051c71360bb832f0127d42 +size 1134615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e676e4c67b..4d2072c684 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ccf8ebee4801992b72251043a6e5d74969fc002ecce0e27553ab637aacc1d7e -size 1036833 +oid sha256:b72db5e5679336e823303f69889f8aae5aac69e577a9c42abc3bdf21fb136c6b +size 1095491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1107a6672f..2afd92493a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2abeb78210854d13c71a3d461c6ede2c713203b371330a8de863c4cda5c2ea4 -size 1057105 +oid sha256:cae9bd7e2f9676e3524bf1671ef775f71e1688f8ea0ebfafe0f7a3852a20f289 +size 1112359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e42ae6f6d..4b04307dee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0309f7895256dd0aa23faf2972ae50340246e3eebe4ae5d3530c7e84fcee2075 -size 1023359 +oid sha256:817de57bd2c3f1337dee11389978e0fb3dbb2fa6b88fdb1548e65978a8984bae +size 1082559 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a6d903d1bb..da8377a787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c06c3ad710876660d2bccac32392003962f89aae858a319b277622a05618379 -size 1080081 +oid sha256:ed05d7bc0f01b648e2561e212082e814ae23188a0c2ab5fbd5ca7807a97acbdf +size 1113825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2cbd166af..0f0f215564 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a174c9911d99f8f1b736a7a00d3b4218b329932b366589efd36994bb60dbef5 -size 980173 +oid sha256:3aa74010c9690f4b3882eba8c943d919a3ba3291a435e888ea1e3cdf6e915622 +size 1032367 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 71d8724699..72ac5c44f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afab8e6c3496049fdc9ce8fbee6f77814871acff3843423e8054ca7bc5fc6a26 -size 1041155 +oid sha256:4e2b256af9f7af9f9ba4dce3ff8dae9c77c656c3fa6138091e44adecd26eefee +size 1077711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 56558298ea..19cba9b81e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:979f043aae8eda45114d5526edcfd3b1a8aa84dc45af3dd3177ea57fb3257045 -size 944009 +oid sha256:d18aa486e4d32fb2df3ab0fa1b7264fda3cc1131a84da6b09689eaca3af9b383 +size 1000397 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index eb07c368dc..48a3dca148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f4f77b1e0c88909a4feb4042a21e91a111d23b2efdc3072c294db16bbcb52e4 -size 775921 +oid sha256:eb750195e3ceb466e4065b9e780273512063a3485c2b05cc646b49b8375328d5 +size 807297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0aa9d3fe13..fcfa0ffece 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0b4d453df327e8f3caba5bb441f8f4deed9b74713db1be53bc46522966b0b9a -size 753419 +oid sha256:c681cefe58f065cdf4bc2648d5bae2ae4752ae1187773f1415743ce3d0c7ed8d +size 785585 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 78ac680178..941ec89c11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c89cb50f6a369151fa14093db116a065aa82f5c44002f919460c52c552358aa1 -size 733179 +oid sha256:2437af0b38efbf12fca997e402225d3db363af2098a9a50f3485ee8e570bae35 +size 753257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cf6c88c1c1..ceedf93e1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7bc0964329ec0dfcb0bc61149fdf810a60441df85a45f58b9dcfd5219a0f695 -size 651179 +oid sha256:fc45a8a81a3175c96b61b47dda4610458da31ab3f3a5eeb43b1f4f0e99ccf922 +size 656063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b7516589e9..685d57d7ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf1fe7caef5d3129798d7002aa0100dbd17d780a7e4a6a392d3fc2844acfe1cc -size 768767 +oid sha256:e1329ac9b5ab4bcc04c7a1754a1fe20d16408294c3fe1d5a6e3d6cf3589ffe29 +size 797627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ad542e057f..803ce08a59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a13cf28ece429ffb6c53e3e60edb3635555cf429d388986d86a5b598eeb06a4 -size 746265 +oid sha256:111e3dd164bb40f04d8df1173719a13a8a30e34463dc5b2ac9bf7e3cddf432e8 +size 775915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cb7e57608c..fec3a27790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2362aa0c05c4e8f8e0546a8c0bcd5f8b3b1773c9b765fdf6cd1aa6fc1e1c79b2 -size 726025 +oid sha256:b0bb290a727351d032c6a6224e603cb949e5f8ff7bbcb9cffea75effd731e311 +size 743539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 866113c644..fa0a5068f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2c1b552f30cdbc29ed4ab81f048ecced49aa197dc2d01efe47a32ecbe95563f -size 644025 +oid sha256:3df8429e00fdff07fd9d61258074eea47d2f2a5fd999d5ef47aa70ae2aa9bf8f +size 646393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 864e95ccab..588a464118 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15e8ea3643849fc1b6ac2477c3d8c542fbfafa197a6ef7bbd85a17f5a3a3424b -size 649969 +oid sha256:b7b88b454240c682b6d73f7ce03ffd7748f7769acdfc0b8f8644f77e360c2177 +size 664473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2bf0a7a74c..f5e58ff2c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f31bd2949471a1f9066bdbd3235c14c25faaa90e752473c613fb8e3c004021e -size 594466 +oid sha256:7ed2417be42da7417cbaffc4cb6edc3f00bf1339f61c23f7138833dbc6cd09c1 +size 616864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ba178842ca..faf09d9ac4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e312c953e234edfc1e8bd89c6f9427a7f5d900d8bf1526dac7cd1e0544665aaf -size 638369 +oid sha256:ed8deb30a5e603fd1fdb840968f9279a23352d57d4f1dcb32abdb78155d6a102 +size 654945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 06ed03c825..95bfaa2ed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11e66572a8325494f8995c0056fc2c062c55fc13e9d5e52f9ad67b31e40be8cc -size 590514 +oid sha256:4a28897f7846987291c316689b27d99f3e602aaae3b6c64ca4ef8e056d360f00 +size 611382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f3951a021c..48ff04c1d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5f2d4b6058bb2d8cf13cc373179fbcb66aeaecdeec7760090a0a05a8975a969 -size 641611 +oid sha256:dba6adc2ca1b0953d74c2ba0d1601809df6e63e361d47bfc724212e67c6e511d +size 655425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61e7ac2bc6..c2ad144d7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c25270f3bda1c35c25a1b045bac8ad41b96a1dd3de7433f7233bd77ee91464cc -size 548362 +oid sha256:560b40c3945f07e18bba652ec90b94e646e29253de00269c447b34758f90b7ea +size 556848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 33807f1e1d..589c95006a 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cf468b27106a82dcf9b7fcc1e39dda34bcd3d6768b76faf6248ecc0f54d8f56 -size 587588 +oid sha256:0823da7faa7e403fd4704435977fd4bb40894141c048ce4c31a77c7907a7cfcf +size 600416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce3a3eadb9..88ca2af89f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:626c53925c9f0e306acaa2e90c4451b2d301c87acfb01e17aa9e91a45e4a2712 -size 505490 +oid sha256:17e808c5f70cf8acb0936db68714e90d6a7a0bb4bb46d793314aeeaeba8d0895 +size 522264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2826fe6b16..1b9fb5977a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38ec0c2046584bebb7e7e7b6de4ce89a306513c44bb1f40180942abd0a31b906 -size 642815 +oid sha256:8cacfb255692f5ea283de64a672802515b5dd77a8568ba1b42abd485100daf3f +size 654803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9d45c5a48f..e3779d8182 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6554602f18250fef55e33146bbe41324708c222f695cdfecf3292e4b114d553 -size 587312 +oid sha256:d499e1545da70123430d56bc686091a7348d74bbf74e8915a962077867990d0c +size 607984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c159e8868c..d18dcb03ef 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:292c878e1412a0ac3e89b9ba90ef65e6b2ee59bce6c683fb57f15a540336b7c6 -size 631265 +oid sha256:3cc24be6e406af32589fcae6ccba42a6d1bf4344056c2cad6becdb15fa68fd18 +size 645277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e797db80bb..4d6792db3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1613483ab644a2d6a92dda308519aaf0470ac9062d695cf98b2029b979afd6f7 -size 583162 +oid sha256:dfac97c6b1996c319479f3212788d1b3844f84c49ff5d5acee693db9bb6ab3af +size 601712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4c5177b74c..2f1f1e2c0f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1466803c2e02b3d99a79e826b3a3f1fa95d1aa58649af04e76d4e8290bbf4e2 -size 634459 +oid sha256:512d93f7c94ac1ea162fd117d8a7b6033738dc13b0b960be75661b07cb2b8e34 +size 645707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cc5f289da6..76f282c83e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afd580825f29ec2c59a754ec8438688655d69ae7c8cfa82f87ed5c0539212a6a -size 541210 +oid sha256:4634306dc7f1f3595286839e5ed246829bad2f61cc5f2d28a8f9a2712e309b1a +size 547178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 993c2f8c21..477bdb9c50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bda74f0657e32987a7d846d950ac92005047312716951927de2b8ab78c8ced0b -size 580336 +oid sha256:6073185006c6130737a9ec1fddf00901f82be49fb9251e5081a7be4217c259a6 +size 590648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4e6e625f15..8a817fcd69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8976210383c53146885e62747e6fa15fb7db5fce54ebff45b746050f4a4002e -size 498336 +oid sha256:74f9b2db91d0c718129c72788ffc75b1f62d69d12ce03f61109599fc1dc947de +size 512594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 177db13cc6..0137a4a35d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb31a7520373b9a60bd935e60c39e1e6b7fb5ab05cdf68982f2c537d57a01018 -size 669341 +oid sha256:17c213f64fd6a0ac7a6fe336363d80d3ed67bce7901266964d7d54e92173ced5 +size 682859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dae4070794..f48428c072 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92fa99a053eaeae8fea385e3a91007cc6ccecb96f1b344cdc7ae396b8b2d1bcd -size 602690 +oid sha256:a77c801cc02deaea7d62fbf7c887d8ff098fe09d51232d91424033013b1db400 +size 635251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index faee45ce4d..d9e28c9323 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b5cd1a41fe63e9292e5dfd74ec702fea4b7f9ccf98feaef41f22e17bca44e99 -size 657003 +oid sha256:43f687311e9a59d53656bb9ae5ec892ae88acc8cc6c7731ed9487982058a48ef +size 672543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c1164b3cc0..5916f2bc15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee8f78883a0f337e7be058c61c3d3f0c9e207774e0dea4859804e68db9bc51da -size 607716 +oid sha256:767d0b88c8708fafd2124a0ccec0012fb5a6bf0954201fb1c017a76bf13bce97 +size 629029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d93a1e8636..b1ccf7b3aa 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ba3e5c341484a9d68bfe97b2b7649ee8175287be8c157e5e9be14f1341cd411 -size 659701 +oid sha256:51e899b88a16c3060bf9b1e6fb121b23cb06cbf82e92fa2e286267bc1c57dd06 +size 673663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a99f7d4396..0781af66d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cecca88e497532c88de5f4a69cd64daf2ec83ab51721392674fab23db8e2ae57 -size 560828 +oid sha256:6eaa999f9dd6c33711199f89fce45cd14b404b57de1105830d80c22709e048ba +size 567340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc2a5ebf8a..24222731ec 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1562f19b055cb7264e8b909e61977455020e426afdf86b3e847e92040b166627 -size 612338 +oid sha256:eced9c8faf1ed6217ad4a9ab377436020adfe6f7c25e235bbf9a03d58423bb72 +size 633307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index edbab6661a..e5a8d1abb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dc9cdd3d14af016fe547f28035e3c359e6a1d590f861fb495dd9fdba123ba1f -size 517956 +oid sha256:2f712f99a07fc7b23ddd81b197a9175ed82d33bc7586f05731d609c15cbe0816 +size 531966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index be933ed4bc..4de4f8db67 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:808480c63d99856ab26d664f831027d8e192b4912af0296a4c00708ea7917d76 -size 662189 +oid sha256:93498cea7c5d7d385dc2a4fc012e4be048e4c1d4d474ac49828663bc8ae69685 +size 673189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 86bc5ab242..5cd2328927 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e98e9770d1228851754ceda9e197066031503cd3abc2a6f8c506eff19a83b5ac -size 595536 +oid sha256:da76b6be9844850c57306655d8d7c0b8c0181292182a1b5312c0e48a012ccbe4 +size 625581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
abdace99ae..a253b48acc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7535dd535f310db1912dc690c071240af752e0ff8e7e271559f067ae6ab00094 -size 649849 +oid sha256:48c29b87abf944d89507ea2927bc331e7bf1abcd76c2946b7e82a0e7a525b0fb +size 662873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ebfb25db48..2276195f58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc27edb57760118de4768048c7119e82b0f6a81f543249404623cadcf298b37e -size 600562 +oid sha256:7a0fb6f62072779d55a4ce4bab1d930629d3c523dbd638497d8e515781a9cbf9 +size 619309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index 33efc36348..db43868d23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38d9ab413702983d3b5785bd55a11ba89366c21ccea5af585dc9460c8842dba3 -size 652549 +oid sha256:3509a23ac31c65377735c3dac6f277116280e560ad7e092f7315acbf87559679 +size 663945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 50c3f28fd7..ea8f6f7d00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1b66dcdc32d2bd1a65990fe432ab812f87cef86e1d0743c9c5de0ec3c11f78 -size 553676 +oid sha256:ca947a21d7ee97aeb1c6b07ce9e47efdd715e3557dfe7750b6e0706ba1b7db22 +size 557672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 444b264913..b75e3cd1f7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f54479401735edabbf0c27418e14cc1b29f0bd37c2912b21c90ffb1b6978001 -size 605186 +oid sha256:276972a65ab532c6b43f35693d12b16a896d301cddf615fd8948e2100383fe1a +size 623539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d505d354d1..18f468cf80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c34f4f746f02fee5425a400309fab2b341e6c59d054d95554143c61b4ed315e6 -size 510802 +oid sha256:61d22edaee3edaa0dec79d40e74954a1e31fab3c4ce38a1ac19ccafd9bf05266 +size 523038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 39327aa78d..af0f2b5fe0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dc1bf9d41d3dfeecedc36f175bc92bdbb5cf0684bb251247bc9f1019525cb29 -size 747699 +oid sha256:1ea1d3d35d00f267247de31c3017ce8a3e0e138e4b6a71bf8154142fc1371c4a +size 778877 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b3ffd8472..eec905bba6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7ebac7c16f8e90a3240a2a3a3fe345ee226e694fe2c3be574f7b63ca358b97c -size 697623 +oid sha256:a8e28b6a55ddf049cbd5cd4941d114d32125b0b892439c501160fab5d5ed4a8e +size 737139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c91c3ee268..8dce3f42a7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e492c739c8e55f3b83d53089a4f993d89c85e792d5a33b0f6e9d2144387ef9d -size 731117 +oid sha256:66186e9782215abb75aa78f582f71a7a3ead04c09480f3020d193bfd9a4b2a82 +size 764417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7728b05e06..6915865912 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:000fc7a730943c6afcc5eca262992fe44f2abaceee4b2613cb0bdd2ecb825a11 -size 687553 +oid sha256:02ca742dd61df64c2f85bfa5a4a37999724d1d6b71838a1b21801729392d518d +size 727957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eddb9080c6..2a74c31be2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16f2436a0f14651f015cbece8f5dd3335add1b642801eba0b74b9dd12aac93b1 -size 739095 +oid sha256:2477ae5230161fd446a069f9ab6fa98b1775d3c4c91ef68dde82eae84e2bb17b +size 772591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a03b5c84f2..860139dd97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7d1f3e97c376d6170f3bd4d5d800c0df26b921d4819ff0f30f57513b3740d6a -size 643923 +oid sha256:4373571a3abb65a6c170ec4e0953414c1330f379ac8d79196c6220030b70a296 +size 665135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a56fa699d4..23c8fb25a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcb5fdc4767c00a128d651a1a138bdb9cf1ae4fea39d3779b0bbf5d2b836865f -size 680485 +oid sha256:a438e80aa8036f50a602f0999ffe3cac1cb65eb06dd3c9628587de53d2937912 +size 706631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3472d94fb..e5a184496c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3805d084b876f22674d8d73ec277092c895f594a91bffcbf5071a7c6ab83ca42 -size 598434 +oid sha256:31007b012a189bd83ba5f42ae6a32ab8768c24e00baabcdc858584c9a8c66d59 +size 627345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 761f854209..bd906218e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dd9c7342d5162eed0769a31d0d92f42d8cee000d17f39e084684a1127591c06 -size 734181 +oid sha256:96eb437b91b51472f439903775ecae58cfb1e19712a92b404179ae881675ae17 +size 759489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7de7b8738e..0e3b91a16b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90ad4e88eed85a563088a7a7aa2bc76ef754e4b6e019a23c370f87460a7c1f45 -size 683365 +oid sha256:32bbc9576ce04f83582cad7ba334c031ba04e5db10ff037147f3baa5629aaa5d +size 717751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c7e5b52f5..5168cf47e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f7e2816fa94f92e42da1124d4bd20aca16d648c336bb80355bf9d0539930eb -size 717599 +oid sha256:b528ce1429f52d066f4d768bd65ce4fb10983bdfebf40125c1df1868db4a0125 +size 745077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 927c669055..c1d9312892 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b32a911663ead9dc2cbb2e8e30c99d729248d3923ee8c2e1d0bb8c1e2e8b9476 -size 674085 +oid sha256:fcc51074a3adfaf729865faf07c1a9bc97b170be47eb910cdc2555bfc2903dd1 +size 708619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 59f943cb4b..5de6a36c26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9748db982a904e779446747e14fff026170332f6265094780f05dff8087508df -size 725577 +oid sha256:82f9910552b0de35a73e553f1945465bbdf6c63f18985a972d8a61702e77e84b +size 753155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 347974fc7f..4acb0ef389 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:669315ed58e0bbd0411bfb65e94ba828b5239b37b5b7df4a0160b194e34ecab3 -size 630405 +oid sha256:4d71f32feb29c7aab283f073d07aee6e5a74c0c3f552fe73d1d18d1ea6f0f408 +size 645747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c93876430f..44aed07dd5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2c682323ea13a4ac6fdd4f766c0bb7add924ca3740b677270da08a1c40c83539 -size 667065 +oid sha256:2ceff53302debd6f821e0f6fddaa94a183452c3a75afc3f24ba56042003936d3 +size 687045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index af450d227f..b16db6ebf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7958f565e79e2bd484167830daa517e698407acc540ae5f1055ce685d53578d0 -size 584966 +oid sha256:92b74e502b4a9cb7780c50f1f259831940c7057c5542b1b57f7bab2a18ed6a31 +size 608744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 721280f16c..d5e2b49fee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:3ee42b4fe8d52bfc766518f95abf35b908a2d25406e7874f895a540a2d424d36 -size 767269 +oid sha256:720638bfb2526e50b234e1bb3f22988613d54bf77ee7b041bb3ebb07efc319ba +size 798053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c5c7605f94..12cb7c93ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbc9832b4abe21944a37c2e6f74589267e289ab6c7edf45dce9c77043ab94510 -size 716601 +oid sha256:c990fbddd555612a545f5874e76a8df68a91e4722c437b725d151b58f3c399f4 +size 755525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b05a7501d..198b805fcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f9594a6465b7d8b5dbde3972b5d54ce7d944f76b5b3d27ec72b45f38ac926f40 -size 750687 +oid sha256:66257501571d759dbb06cbd203b09e823eca079ef6ac9187e00b9c8afd9e1a0f +size 783591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a760dc7be8..a1cd0c85ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71c8d08b8cc98d785940a2db65ae7a2ea86741a3994677d6159027aa2e8a9d82 -size 706531 +oid sha256:494154fd13093f78edd9c994bd66a1e045a3c99e131c2eae8edd461a82d2151a +size 745603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 95abd443c1..8169ad7994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b2b68bbcd4d1acb271adcbae81950abccfc58259480f060c31071fe4b0efb237 -size 756395 +oid sha256:cb295607138805db482f27ba7eb6bb7cd7103d8fb8253699c589ffd789c09531 +size 791421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a7f55ce94d..ddeeb84ac3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75b5739e78526b292cfcea3f66e5f7bd6764ffe024eaaa8372c0a846bb0abcb9 -size 652195 +oid sha256:4601af2d0b63c601a4ec7678de31e405052a19d2b05badb583a2f7765e6cd8c0 +size 673409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8e43d7b062..825a18a1b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d504278ac4c76a74dc73d020dc1e2030b5a511755fd8fd919c35abb029d18618 -size 705135 +oid sha256:31277835fb8253e0a56deaccbff9838e2d834646665c914f43348dc7748233b8 +size 743615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3b260d24e1..8ddf29c128 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74945ce97e9ca00e27c0fd2d2de288628afbf37be25ce987b910c37ba1ac5448 -size 606706 +oid sha256:a50328bb792e0e70cd6dd2bae20b9f9b344361993802923ea8ccd5a8a0a38f3a +size 634827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 22bd480365..4bb162d089 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:9c59985c880f946112f14e919223a401a82add884701138a6d6b8ec925f62139 -size 753751 +oid sha256:387aea135785fae9b10044a72c1cd4522eda9f89d3d5e800a7ec12041ff95104 +size 777925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 55f1d3ce49..b5aa0dc7e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a55787a5d96ee14716fc30c0fdd9af34e1318f129fae81c9ab6a3382d33d262b -size 703083 +oid sha256:a61002f21e8969d7dfca4909f8075620a04142e3da1e7f7eea55ff3e1d92bbfe +size 736137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index af74f25368..2b69997777 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:22b47a25dcd73d51dc921a02ae417de1839b0580e281d86dcf4a0f432b05562e -size 737169 +oid sha256:b50bbe4c21b676c4c7c60691dc3301db272ace25f42bb22d72ed810e90b53904 +size 763463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2afe61121d..55c7fa30f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:524cdc4c63719d432a93dc4cc9a2574389c5e375111e6b39def97082f1b48fad -size 692225 +oid sha256:53d007ba3a67407a40bb9c66eb651619a5b06eae435850a72300b5ab648a83ec +size 726215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 33cc0efb76..24516b848f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:18e87f42ff7d9aebc925c990ff31651945c959e137c15479a4d5360929b70682 -size 742877 +oid sha256:13cb1c0b8f3d986412d2e7f63d2df7c90e70e3e7a7d8ace81c95713e5a664a1f +size 771195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5407f4824e..babe6362e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03b1839d309a3ed82dc8aaa72ec73aef325d4e3fa98e5558ad2b5ed2403bfbb1 -size 638677 +oid sha256:9eeceff9d7fc7f67f7101470d40343b6cf6f450c1d239fd24ea44b0f9117b74b +size 654021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d6a234fc67..3a09c74376 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:32a31cd72d9fe98dc2699c2cdaa90b67bc1ba47387591c5f5d5ebb245a2ce14e -size 691619 +oid sha256:eb8e5e555724068a26deb2c5c7d78a8891bad92e9045f6df93e2f3f22dc3d370 +size 724129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfa000d896..1d979de787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fc3b435c2d53ffc197f97e3b1f5bead1cd38d81f63b9581b56faecd97311583 -size 593238 +oid sha256:90f46dbdddf5a650498bf1600f1f816fa36ba11990dc22192cd81d2fc10d4d4f +size 616228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 154d35423e..471cad14a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ec08d4c2582d48ef8f745393bf7f6e12661fa0279852d8ebd68a0196f943e93 -size 
704287 +oid sha256:4c2b829c4023e1012512c36ec20a41b27375ed485f90a4e1d9c5f83bf86e7620 +size 740005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 32292383eb..ae48498123 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ac39c1699bd29793ad99bfa905bdec1f2669d36da462b3983ea8674e241392b -size 695351 +oid sha256:e92f71ef1063767ca4254795aaae594f568264d97b5356c0a83c2ba726acae36 +size 718489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 71d04e150b..55f3f72515 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb374ae08a7aed1d477d4154e6b7248da6cc26d531f4add335b3c4a44b4248fe -size 679205 +oid sha256:6cea294ef77005ff9421ec24bf27323c42bc6e9508a5b4227c669002d303aaa6 +size 698939 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6027fc2f97..51da8ac9fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f678588600e473668b88684bc2b4fb07107f2cb43d582f44df42e250c53c921 -size 592468 +oid sha256:964c13aec1246a629ee3b3fba4b741efcfda08555e3d9529ec16e3fb8c41fb43 +size 604900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c14dba6a3c..ba4e45076e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ed526bfd5c6c1a168458d3628c825b8049b6bf3215e2aef6fd9d3ae7a33325b -size 697133 +oid sha256:08415038079827522a73927a7093bc2c8c9dc70ab0e5bfe77caf0185b3ae9006 +size 730285 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8fa8885132..b888815bd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f55dacc78f673a126fc791aad2afda5cd1e7de5dfabddebee30e902308a9b809 -size 688199 +oid sha256:a9e81e52ecde6ce7548d55e1fee6de16a3998332d395dfb4fbc1ca1528559f66 +size 708771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2b5e7cfe20..a2da2fc50e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc0e175ef337841e8f882938cb9865264970414da30b53e970b11b1355315e41 -size 672053 +oid sha256:94f62ba2b491b94d93053fa706fdf3e2995b95df780538d360facdc4bb667828 +size 689171 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 12c9ed9229..785248669c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0eaf016fca1d00fce468074f6a5a239da1ea879b4129cc485f36a9245e809fb -size 584526 +oid sha256:ab3582d1a5045878a7bd0beb6d219297af74884969be168633094f8db29bf79e +size 595232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f2d24f6144..739840347f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e183d6da6884d46551c56511af241dba67b67d12802a9a7cb8c5066ea9ba7f4c -size 627669 +oid sha256:131f2b7a896e8e6877451cf06dcf7a4dc6355ea6f14ab30e12a460e446612181 +size 642617 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8dcb13ccef..d11e2c7871 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dd48505d501b3ffd0e191501c40e7bb044eb6fea2110fa17ae4e3ed4848e349 -size 594366 +oid sha256:4dabaccc01fef099ddf30711a0f8b697202a3b40decff2ea69cbb77606e92867 +size 616714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 342fc852ef..9d5bf7ed1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:106a3b4a256182f6f1c2edd539fb01f2a91c66b74e98781b53c1a8729eea434d -size 621299 +oid sha256:3a9566d03229d3fc42b64b43e0cdf845e7cc3ad3a09bd4fac32d2bec4ea75cf3 +size 635555 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1be482604..eb7acfd38c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd13428a05303612e3bc773d6ffc00a9e4c3720651da4372a46ea75ad5bbf754 -size 589624 +oid sha256:8ceefc69192f053890b5e9dee785af1ebc912f6d35a12bbd870235624ed97752 +size 610392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b84cb7d002..6a4ae2b253 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:287939a25d3c5751f9f28ebce00159976d454019c85bf078f0dbf76d50ebbd5d -size 624047 +oid sha256:d7a61d6c4d9bcf7b99e64e6bade8443a3278973fa37081ce2eb566f76e6c9ed0 +size 627993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2c5c57d0e..f923e36ba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e2c0243665c5496ec63929bd39d300d5a469912f46020bf6d6e05ca2c681e2f -size 541306 +oid sha256:15c1a6c592f6455f6d34930c791192cf87185414f6810e267072bf2ce268ea85 +size 547768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 87b1fe79e3..3fd645b98b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5d01eecaa4434de8b6aaef9605ec82765e213a2f046a06e784daa7406469284 -size 589462 +oid sha256:215b5d3091bbef24acbaafab301954898a391e75e5e8a2e78318648fc7c8cc49 +size 600068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d840e4b3f..922b3c7775 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee0ba2dac5bdcea934e67cf31d66165dc314c089befa1ca839641349549e1617 -size 506326 +oid sha256:ca4baf01820c282aae16b2bf0815bc1c6788ff71b05e8c70eb7ee7ef5e6abc0f +size 520486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a5d434bcb7..4daba201dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1e9f2b00d1bc2210d8a4ca0ca58195f62e10947ae02b145f66d971fb6f604e4 -size 620565 +oid sha256:ab49926de1e346e0ce587063f908fd248a64bc4703481329469fc1d28f1325d4 +size 632897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3e546800be..1bba405d71 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27d0ac3156fc77a3bb7a3152f44c4c09239ab817466f165f133440f876eeee0e -size 587212 +oid sha256:ef1650a0afe4c9be7c370d0db4668375455696cd762db8eb4d2108315a351346 +size 606994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a7f906c93..db66ca9e4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:182f281b84728c4c706103df1cc6c0c25f5e9b0f68327ad1adb2e188a882fc33 -size 613354 +oid sha256:dd16815bc0d9f156190d5f155a17897e3e74b21e0f71cfa9c151fd890e7b0694 +size 625837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bd456991d3..dba86d6c9b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e7e4a5595d14cd1371ebf344660454ba3356c86481d30413cf366cd1848ac6a -size 582470 +oid sha256:a569dc8c23d6b1149d04c581617954d9c42fa7734ac08df1531ce0170659294b +size 600724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2ac08ac0fb..a0f82b3c6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11c4b87cecc1144d9cb27ad0a1ffb96bc5e93c34044da4e7fe338a5af76cb2ba -size 616992 +oid sha256:7ca96adcc1890ca4d23c105725fcec400d884f76e650febb7628e65ae52da910 +size 618177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1587c8022d..48cac2d063 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18aa6b82bb74a89d522372e83edb079163279eb25c9d60ef852b4af3437a379a -size 534152 +oid sha256:efd977cdad120b67064ee344b23fd3b08d145d2acdfeaab4658caddcd3fd9598 +size 538888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e8edd89de1..d9b446528d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d847326caf3fbe4ba412f201d3283b5f97f6c93bca0e968344018636a79fe19 -size 582406 +oid sha256:a387aca1cdaa99c3968be17fdd518dfc3bf0145210b9b318fba45d9bcb67710c +size 590448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fc54515464..a5dc1ed04b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:ce02a790ff216dec7fc4527eb70ce29fc86867c76d1775f901cc37e1ffa5daa1 -size 499174 +oid sha256:e361c71b778145b084f6e3e736582714b0375c824ddf160fbc17da1f95193731 +size 510816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index beb0c5aa29..9f12c667a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d46fb3b0af5c0d7c8fef44fc80ed0a5164824c6945a5e0c6be5a959bf1d7987 -size 647879 +oid sha256:5608500eb32a7cf16cdd085f2106244d4d8b407305321464116ba1d05a1a442c +size 660213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1aa1b150ec..dbf0bf7450 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a194b8b4cc02797966208651f9d69601fe02df0975ef2eabee384162a819e02d -size 603476 +oid sha256:5a2fdb56e62f3973fbd502e6a6d5a71d73461887cd6d08fbde6fa4edf4964211 +size 634311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1cbdabf585..32dcce9a46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d34d4936fe5a1d27fb2144f9735f744fda0c7a70efee7d00f6cb8dda67f104eb -size 639931 +oid sha256:68f2e00a580d037021bd3861c78e2cdecac67145ec270f927af00af931d7028d +size 653153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 815607a0a0..da0962cec2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf241f117c8a86003629c6a1e2f9f2f2fdea43acc1f27e27e32b0df03e087dda -size 606826 +oid sha256:7d22138b724e8c1fd20cfc294cbe97de86fb230e4b7c9d4a8b9dd697990caca0 +size 627251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6fd91d085d..5882871a03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d0ac6f33482f10ec51c6424f27b2edd0dd3e8b738dc57621f5822e3011e9830 -size 650819 +oid sha256:84b37596080527c171103dbeeeea906a7153a924cba34e66ff797ede548c47fa +size 662759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 268a320a8a..57054e13fa 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6da5e9ec1e594d4758f6d831700450c44d9871f4d4910650131c66f5f3aedcd -size 553822 +oid sha256:e96b48c8ce10c635b9899f13717daa3629948cbffa4d802c1128c6c6936a64ef +size 558262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b72cac6128..34091372d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4284f9a18382429b7bb279e175ab0cadfd31afe20f50c0745eaf8c0b3d7022ba -size 611548 +oid sha256:fcfb9d6db75500ca834f33447c3e79644b5cd4655c013fdd094f279c294c09c0 +size 628569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2f777a0f41..f86ef65b36 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a4470d714ff2b97176bf672e1d983d3618240c8de76adc3223f7fc14640a874 -size 518842 +oid sha256:37e235a8a0734c0586955c577701afe174af2a31589707f9e6856e32662dc508 +size 530978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f80c475ba0..93b2d46ca1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14dc81820696b1711295cdc5bbae1f94dd9d786dad2ed5744af25eaa18a13aca -size 640727 +oid sha256:6a453850687de8d651dc37ce03473855b4796f361a808c61f8b2677d764a90b1 +size 651333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 947a41e030..2c2c44d35e 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb88b283d5f057bc4937ab2280d66c58ea1a6cc33c4f84be0a0a71cb3d32a142 -size 597112 +oid sha256:ae65dd742831c4d30cf75398e11dc346a1ee22ea27afce4dac4d429a4cfa911a +size 624593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9e6d364080..0e25b63029 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c3dd723a40b04f11e325669ad0d2e6bb27a7f688490aa3dc3b79f8e8ed71059 -size 632777 +oid sha256:65d2844e3c3f67e314ce9e7ad8e99dd6ff8e0da70f4a20651716c94fa8333a42 +size 644273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
1023f8b6a9..176c6c2604 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29270c022c7c8b342605dca9bb141a2a161a6fe409bf404ce3d340ac0d0ef462 -size 599722 +oid sha256:e3f079c4e50a9625320e10a319e2fb463b0ef114335a349621a9c2cfead11763 +size 617530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bcf9fee6dd..4276a3d7e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4caa9c37a17e49584e51e6b0dc0fb827a44a90659e18f5c4c5a9fada63d2166d -size 643715 +oid sha256:93543c28b554fd1ee883c3fab1bd1d22a3c5f2c954d40cf653c9d835cc87c666 +size 653039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index be9390ef94..bde7bdf426 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b60bf73733ac5916b82798c42fde65eb67c1440c152002131498d2a6fa9fe561 -size 546668 +oid sha256:52c681f4df6f1fdb8372d035f9bba845192694b792a5caad0d714f0679592d48 +size 549382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a3e868d4bd..d7208acefa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25ce2ef4f41f86b09918e1a15e3bd4bfbb151ad7767dbe931d6dc97463e3de1f -size 604394 +oid sha256:9341143e937b80a61b222a5901d7acd03a5a5e1d26c0e6e35297ce8389fffccb +size 619639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2d6557e830..3da122ca85 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5996073f0a4fd6219fdf660c23206c5962e9c4170cf6e7490eabb0179888a898 -size 511688 +oid sha256:2d933d3e0f3795653a6e9bd605b02d7ee7f27c09a2b1a958a5e68ea2b0cea5bb +size 522098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e7205432f8..80fc66a3fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d143608f3c8874ba25c32802041b08bc9326269bb2c1123f41baa07a3e09bda -size 821143 +oid sha256:bf928bc38b6a8bbbb8253edbe935a65cd955f86686e0486b031b3be2f8f5bf0b +size 858241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 
071f056c64..fcb158fe4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc92a9967fbd82f3b516fd732515002f6f6dea8c438005887e06eaf8e999359b -size 795389 +oid sha256:a4b3cb1fe0b137507089f73aace7082d49538fe072d6bb8264b4e611ca1cf409 +size 823657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6876e7802b..3d81b7adf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ff88ae2d97cfaa50b05964af46490edf134f13b6741e4aad120b42a6cf54315 -size 714779 +oid sha256:b52e2ab0e057fb8574ac0d6106e5086733c2f2ae1eacc2ca2d0d08d6e7f59a01 +size 742159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6fdd676e44..1479316090 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:717441d8bb4ced220d2558f78b4a4ae8dcbcafc1afe065a648460e32e8a35468 -size 767417 +oid sha256:8f1d894c283a14816d0bfb73203f2c15c2d1abcc0f8ad73be21e08ab12b7b6f4 +size 809053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 009530fd17..b5c7adab6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf48db6310e958a01c7769e9d345b71731651f42c036b8cbb1fb5449e2e9f485 -size 743487 +oid sha256:fc31b8e6c86c9b0f0145429df461a6b79be2e297db20e6e3d8e9ab080c9bdc99 +size 776195 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4255861919..768981153d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed79d375d70950734f59c6b968bb27d2da0ea25273ed5027ac4970c1a3f3a336 -size 611730 +oid sha256:a3488c96bc3f216f9ed532bbf59d1577a5a7a3e81ed7772acd501387d364b0d5 +size 638321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4ef1eb8bdb..060a6cb512 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5393cc04ee1778e68602243a4085a61ae3a5b9a4c1854f676a9c46b837c71f0b -size 811023 +oid 
sha256:5615fb72f1eeddda09f36876481e0eae3331b91a1e7a46b117dd96d8a627ed7e +size 849059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2b77fcac37..1505e74e1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9d78ee8c9e9e9c14bc3974b998f77572323e58d93463a85905dcb8ccb16f5be -size 785269 +oid sha256:31313e4b1f37de6e10fae6c4a3ab821a87ee60148179f37adbf5207a1b52992b +size 815263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5e7b9b9148..8a330c80b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:d0e8b3475fa2e4c6a0366fe3d315e5d0fe78307be9efe15ef81279cc2ca91bb3 -size 762971 +oid sha256:c8f0625512125b896cb0c7dc90dc3b1c8016c12635391ac66f13c9f63bb5a279 +size 802881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index fbb9606e51..7c04b3fe48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:601de265ef0ec1cf8f48109a91170898ee0728fcc284c0e8f6ab7d6480868858 -size 739041 +oid sha256:89c22219a9c155010d5936a9c8302285e8f7eefbef8ac58cd40ef3d7b4ca7f8b +size 770023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0b7975ef0e..f188415cda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:e17f1594388d4546163841989391d1ba82c3347048a1eda0de079b82d5f49601 -size 820629 +oid sha256:b7429aac8726a3edf6930065b34cfcb6e03da50de3fa277d0a1c505998c99737 +size 857087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c5276fc5f..82d8c86400 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:553c227798787e5e9795a0385ff494d4b6ef86c0eea2a49983e2f7ee20265927 -size 724125 +oid sha256:debe5cb2b1c24874d9d084a89a863a2a9dfa6e8c546a26a7bbfab7ac9070d97f +size 761125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index bf0f4fd525..ff31a6bb0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a08526f6785bb3cf9b8b80e5d1be8ba57d85d91097be563ff3e53be948e0271f -size 798477 +oid sha256:cb0893148e2c17d105409afbf2e7336a568176719024dda001eed45075007104 +size 823341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 4beacc14d3..a8c9cecc7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b08c38d850f5f3cb3aebef1d7021248c763e7b2a7701624a113fb1850014656 -size 697681 +oid sha256:0fa1de28d522d9fa119e12d888c9ddee743731a8fa560dd75ad097f87e5f3e9c +size 726589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index d441819aef..3b30772e1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:3b1d4e871de7ad40afb61c017759095caf7c35950d1e6046f890dfbf0c88571a -size 658323 +oid sha256:62b3a78fb12619208db9804b0f401b5fc4e17134433577deaaa63db51c2321b0 +size 688169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6f3f142d80..5f57a5e28e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8102af2eeeb7c9fceeb233fdabcb9a52c5143a061a6a61a82acb7cb07c5feb -size 576666 +oid sha256:975aaa6a646cd6e79d52086ba591260213a8841b02c24285e7026761d93607a1 +size 600248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 44e74ef198..66243f7852 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ec9403511cb79ee28c8e4e793d4eb983d988cf94d486131daf7ebcd526537f7a -size 775389 +oid sha256:a4d6857cf81a082443512e143925fcdb7ef7e88a7156393d2dc68080fec24585 +size 818161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 62deaf9647..57bf07784e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a13f3ef142a814685064e683bc6d5589cd1a51b264856bbfe0eeb9ab5f3d79c9 -size 682733 +oid sha256:2e3b40249d7d04cd0f1c88778a1da50e3a2c3d168f23171c07617eaea94e06a4 +size 721557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index be5fbb13eb..e23f0dbb56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:24e6066eb5f8c85f6a29b41b01c9949e478fe7a90e68bef8861f5ad4a7d88540 -size 755801 +oid sha256:1f98a7acf6aca6caa8df439b770f08baf9aacb6f4c9daade7bfa9169175e9e75 +size 784563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f1ec6dad84..85de95db66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c4381dcbe53706c30c86dba6b7217aa357d47b91d6f240b85e6e5263c9dab0a -size 658063 +oid sha256:8667b6266d3e9e74e8d775c5c4f2edd75840033b7372d94dc4d4c3a24e25ab41 +size 688749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 24649e8392..3c13cf853b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:7c081af04cecd37ca60c47a61f3de8b8c41f5084c6888bc7469df841f7f69db5 -size 800521 +oid sha256:5baa138c205ad494ccc8494e97a23842af3d0fad69a47819071235b014c8a18d +size 828493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index deb8c64df5..96de43e4af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02c991c62bbd7f3a0451f3ba44fd257a1d7244cd3631ecd7a432879f39cfbcf6 -size 785077 +oid sha256:28cbfa47e5deb2739e2262b8e3f645711a2999637c278e8a855ad89fc90f5ee9 +size 807673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dfb9b870e0..bec05d2a02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:ea618550c1fa5ccb585f27df29f61d0300de0851e3fae92a0d00cf2e6e9a07db -size 694109 +oid sha256:0b5579881d0f253e006935465f35affda3dbacbaea405dc9a22d817457178d24 +size 712411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a3bb76c12e..116da4f491 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10b1282c1b04b236132f4bff416a9beb0e85e8534b2ec864bab4916cdd799363 -size 746795 +oid sha256:1de05659338f0ba3c97adb369a2aa5986c31dcd07738eae0e505e095e68341f3 +size 779305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2ca9773667..f9b79a8ea9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45e0bbbeaa914d0a41b12c8e77d4b43b4ce34d4b0d9101d3c277ad34300a5eee -size 732141 +oid sha256:395976c7f85434025504f67c93d600b50011a188d7aaf314953bbb0fe6963802 +size 760211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9688120248..9e8f285956 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92bf6297b55c2baf92c0602ca906a315cd48b6a2ed2e91e8fd3f364fde5d161b -size 591058 +oid sha256:7d8308413d0edc76185aadeae53987d97b94626faf17fe35807a04891a312930 +size 608572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 006d3c1ef4..7b642d96e3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37ce3f3a9fdb476725805834058f83f5cd40995c9699648c2a5168a60a17339b -size 790401 +oid sha256:eeaab50e4df10dd2afdb358aa0c49425cf68cb27ec807ae0d8d488dfa92c0c81 +size 819311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3a963f9c2c..447c79320e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a4fc0f4dd1c409cab453c1a824961cbebff2b29357e53b0be3fb9d784bd477c -size 774169 +oid sha256:56201b42c29f5dbd5cb6c8ed83ec44b140643a22869accf29cc6c3c0ebb00651 +size 799231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
f968988c5b..1a581db67c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db158454d66a2a5703bc9d67e80b83dd060755e8144dc7de7fd7614c7efea53a -size 742349 +oid sha256:7a7a7a9f87e35c80f774bd40889354be4283b21b27dbd42eab0ee3b6abb6febe +size 773133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 256f745303..9ad97fcbd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db56cac7809768a8bb8f9c22694d160f1868ee62c379addbaed8c933f150990c -size 728731 +oid sha256:8c12193e1fc8f7eeb488df7618ca0b6d4e8e765edc226b62a1d0f426c16b0673 +size 753249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index f2ab27ab07..3d297d4ce2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1e6f2b9d70f0922fff78ac4dd92f40e3c605790d23328ca70668b497c98e18e -size 799959 +oid sha256:a087c049798307479f1ab5253090da1fe1aae064d487bf25a21c160a8cf94056 +size 826303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ba572e058e..b6a4942a6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f4588700082754122079f13279b7ef033c851b5306edab2cd47e4868179ac04 -size 703455 +oid sha256:fd21a0c40cf85617bb9a2a97e6b487e24eea474d419d0fa222d290a30d624a1a +size 731427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index e408f37165..f87e26ece4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63948cb8fbe7266d08e67a7251eee1d4b11655795e24f02ebbb9f2e09e5c59d0 -size 788313 +oid sha256:df979bcbb7ae30e530fc1053517b7cf2091784bcce2cb140cbd95198a88b1856 +size 808689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 4a021fa014..cd4529bd88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92adb511183f38cf88c147a8c696a6368022933ae444a5efdfe9f153342ed525 -size 686581 +oid sha256:a50626c052fd75c55c4e6e9e5ec5edac7fe45121bea1b4279d6fc2b3793e6ffe +size 712185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index a87abb109d..ade2e39df6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4ab3803d2801f4337a4fd3dd0009db77b1ea6567da0ec36b3595c1a86d80dc7 -size 638491 +oid sha256:edfd0ba55f95029bf26e794206e332d6d51d3518ae5089a8af1b97d41507dde7 +size 657287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6996464f6c..d41bcb3f23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88c6451ce6b0bf3d5e2d345e2ebb9dd4ad038a45eb4aab7a49aec817515f081d -size 556046 +oid sha256:347b928ee94ca14ce11d29072746ed43d627b6701f7aed845bf0d634e8560899 +size 570500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 76b9cc43cb..1eabff9f7e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40e2be96ca179d0259db4bcdbe05ea0e52b163f1483021cf02bba487c733a20b -size 755557 +oid sha256:d5741e8a68d3e48440c3ba6e16b6e9d126b9083e2f8d44cfaacb7c58a9c26691 +size 787377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 794a90d7b4..94411e7759 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d975f07b1417a3fa046eef43a88470347490f7a905df14af647422ad34427bd4 -size 662111 +oid sha256:9c4f50fdf71fb9043a75973f878722e640dadfb74843cf8786ec7433f05b514f +size 691809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 3bf4c3d41c..781cf6c20f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3970c09f77e48b3152694881d727ea2e44b11dcf346556b6dbbdc1f7e0b8c84 -size 745639 +oid sha256:32e1b306b75b6fd104736ed7d1c881ee03260d1d060f0b5c741732c835041cd4 +size 769911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 81bce6e9dd..e6774f3801 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe73a0d104160b14738f4f69ebffa12f5bec9747c72b8fa2e063beb83b73e3d7 -size 646963 +oid sha256:fd52265b6937e998d1839c245f0f7df96fe43f3530db18a65117b021f525284d +size 673555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6825080c0e..04e4b14a42 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9dadc91d344f4fb0df03eccb61507a42d724ae1796b240c18932d15cbe2d9d3 -size 720207 +oid sha256:58cff8957dc1580d88ecce6d1fef301d6e21da9825396a171b09cb81ce9b400e +size 753703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 78bfc4d93e..eca3a93012 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07055e2a2175c78facaeef8e721efe413291574ed8ccb2baf9182315b8784495 -size 675065 +oid sha256:64c44bd5e1c79cd5036221ba83ae7b01d10af31168001a11b1aa86c645d5118e +size 716209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 799c9d3da8..17d43370b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:414e23ac2c862220f0b6ebfe7754a4cbb3470c2a71a22ae6be49a96eb2602ecf -size 707965 +oid sha256:17d63cce920d8092347150333854b5a3887be3c23912c496991c1d78c92469fc +size 743387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ae11df610..0786e823df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f868e73ccb7354ce2d90476bb80b2e84a4a97bb08a0028a05a19bf09553ef169 -size 673825 +oid sha256:aa5f188ff8866e5896407b821841a709b3d1b2e5d3f320a0c88de0f0295a6b4d +size 710973 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index de98a1018f..a98e324b56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f61792e426d0c98005d10b049365eabef6f71989b033029913cb0ce2890faeb -size 719101 +oid sha256:779e802e4a8b97fce1e913b9ad4d50e707dae8f90f775a4c61aa043df7722417 +size 753091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8580ae335a..5ec67b89a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b55d4b129f68b6dabe788d953e31fb944d99fdafd5bdbab7d3384123c4380fa7 -size 625063 +oid sha256:550ceb7b1d8db98be11847c6e03c4ee0ad2fbc475083550e9f14c58e82c9ac8c +size 662311 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7ff0ea0c1d..f4b20edce6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acb911482121959446aec2cf0d559a280db3b6ac873458506a7bac2c5094c184 -size 685651 +oid sha256:0404d09dc397fe872e695c0159ed67a8b07b2ee2067f8c761e6e0c0baded5a79 +size 723637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 806d4dbf28..90d2abf4d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc9b9af0841c7209b16d50ba19666cdd847d60be89613f78f631c03fa3a8572e -size 596102 +oid sha256:5416cc1b2b4daa1e7031921c79013fbfc2e0d6ba1fe257ceb25b327bebe3e1d4 +size 636753 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 87e704f536..63e6fd5111 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55370db6218bdb8c6eb1edb23f4b57f05cbe8aac633390cba0691bdb53771b75 -size 876347 +oid sha256:5e3782f7c5fd0a685861e4cd4bdcd0d184538bc4a2cc0619dce0c8144060a496 +size 920747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index bd6c6c5617..5469d7346c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8fe0a6af65c2696a859b3b20ad0ab0c55eaefc2c50a1da969e3bde3e2e77efe -size 841861 +oid 
sha256:8e80d2fb2acbf1ef9c2b0ef0f3177255cc54ad9ede86f6b546fa2f164196f921 +size 874173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 77a9e5dc72..37dcd417af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0efca831c1c543d565379d7e65130a144abf879871ce5a20b99583d022293dbf -size 758547 +oid sha256:42bfd2672b6df7269c238566cd918d162375fdbb8be7825e77df985237cfe29f +size 789183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 98c71ec0bf..b38c66c4ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:92b00f2a49e808f5e8ea5d4d8d36ce4a82af2f34a072d6179517cc087e43c68f -size 772697 +oid sha256:3c6de87a341b1b325e5b75e331a33e7df841ba451a1ab9447fbf6a77c60bd5dd +size 804567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 62ec0619b6..a77472ad65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86af3767ae621bbcdfaf2bbc5626e5f408a7e1ccd6846898579b12351c4c57d5 -size 827257 +oid sha256:aa07fe841f881e930e05c3f8085d42a6863d4b1312c8510d2f0756e01ad2c373 +size 875703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1b6b13fe70..411cf8371d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:5af9ce97740638e2b12dcf6b7c17ed4122812dbc973ef9cb2aacd2378a75ca7e -size 794547 +oid sha256:bc4fb569626909c59deb4eddfe6a0f337a26ac1d33b431a89dc4b7aa88c412fe +size 830857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index fc15718cd1..99bbc9843f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33768b7abd74d31c1d5a8101604820b11153380ac3d4930ef4d5a5d568069bc0 -size 622049 +oid sha256:49954a147b8ddfd0484e8388e879f7d174ecdaee6ea343853e050e2f7e1a318b +size 653771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5c03f65365..29bf82888c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:885fbce454a38b8c391949e79c09175acf64dcb30e04a205d20cbc9a2a471f07 -size 655439 +oid sha256:d3f7f73e5025d0def1dbe201a9afc5e70da11c4d7cd033c8a5282a813b6385ee +size 687111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 109aef0075..cf08ebad01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:058aea4ddb1ddd0c04127a79db6b81711ecf1a5f833ad99d75b086d56b01ef14 -size 861343 +oid sha256:cb73c7ff2d7b9652ba346b5c52a37e5d352eb0c52d3b7a0a219df08dcfbd98b0 +size 905793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 26c2d41bc9..5e59f64842 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fb6438b91288ff4bc6fab5d3bacfb21779a7eb3830323dabf2c52fa6ee50b86 -size 826067 +oid sha256:910d49ab9c31a998ddbd90f7a1657fbf6a04d5062362e7a08da67345cb0c4a7a +size 860009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bf5b4b4fb5..356fce1671 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dfab8e488084111a62a36a9f2ee3470221e53cda6d97d48165d0cdfce88a822 -size 819063 +oid sha256:cdd1628e9521728514ea9d0067ec7bd0c305f4fdcc39af29cfcedb8d3cd92828 +size 866817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 
36c8a18743..384e783b57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0978bf1daad5e7134dcb1e838066c91eb324eb44b36d017178f0f99d43c7ba4f -size 785365 +oid sha256:41d6bd6c610ab9ac42ce98a3a86125918628ff2d0f8313feda13368a8753a74f +size 821971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f82eb74fa5..72ac669ffb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5973bfeae5b41ae913a82d3fe6058e7208a697ae3e83a0d7713a6868260a9d0f -size 873811 +oid sha256:8b350a4745f03af27a4e12bea8ac22706ed0d088ce26cf9c66c8064cc4fc1891 +size 917125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 33d9b528e4..e888eb1f2b 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3923d43cc5f8edb9e99d288b530fe284ee26cf06d3767ba2082c7cf5db0c4a2d -size 774347 +oid sha256:b5d46a3e6a5b0f686d8dcf322f62834d09bd4404db928caaf975884d3e5efabb +size 819487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 536b80406d..e13e325c04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc1d147092a0833db4b35a303f346528025f98a3dd9be1499cf3c5362c85b755 -size 843863 +oid sha256:77826f2847027a91cd9e9728195b2013e127e8aaf9d9948e3a8b73966bbceea0 +size 869763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 720c37c95c..bff990ec38 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:939e20f0122de9740b77bf8ad3c00f292022678cbefff93083f1ccc504416f90 -size 739269 +oid sha256:c367db855dfc5eddb0c4d45d1867b54bed6a97a9fa4985fa9f687e1ebb383343 +size 772865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 569f5393cd..007f2b33c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4343e02e37379391ff34651e57440bae2484c09d5f1d5d6146c4074725335c1a -size 674907 +oid sha256:713e670f97ab723c1a48961b8b344a081f1a268ca1188c843306e23a5bcfe0f5 +size 702187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 58ebbfd1ad..013303d556 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e88182190dd4e29dfe07d1e9990e9f678817c321822d63e83eaba8b7fed156f9 -size 715303 +oid sha256:6bd161dce06761f3ebeb99f0069d4b1771d8da7934b0440304f70995f5bc054b +size 749441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 58cbef0a10..77dd8cbb41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f0fddf1b9e2f8088e908cb8d7da2757f272d42dd159538b9fc0d7f93dcb4460 -size 584174 +oid sha256:8475f1503b3eac571d88f2d4a2d15397ea1ae05fa54af535ed6ba7527447e540 +size 615006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d3a76fd1ee..7928a4208d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8e7b26cfbe2acad88699267f0744177e024ad815ca629ff1fcce83c39e75730 -size 620229 +oid sha256:b7b60a5dc904f502ce0013987ccc2906c15f1e96f5e650a8f67c06d7a1ff2ab6 +size 648843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 340e004684..25f999e089 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:265039d08ed80d292a077fef8f64c1675f3d4dce5260c252d0bee4e93bdddf98 -size 822797 +oid sha256:65c245bedfeccce1c5efba85a6dcc2ea32bff8cc2c928eb1da74749b9ec26cf1 +size 872377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5e022f724..4e7abd63c6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d44e0f904382d954cdab400d532fc6ddf55eacf900cc8680276d2df0bc067dd5 -size 730241 +oid sha256:3665e48d9904aa1242504e1bef50cd0aca9947638c6145212a53c17d5e3ecf21 +size 776713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 690a0a4ca9..ebddab9242 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65e7eb20a3dd903b2c6b3fa1de44957686d5535ddb6932ce675df78573bfbc3d -size 794479 +oid sha256:ef9a3267cc6b5629ca0c294bb32092ff564e3077efffcacbe37c68341f68003e +size 826791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 5fcc051a8f..2d29af53bb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e38f37d264fe1f72e2256b364cf0656061562f3d2ec8d7f005248347c076e15a -size 696939 +oid sha256:5638cf098f53811590f38ef42089308a87f66fda560eab5dfade21b76411ed68 +size 730239 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8de049d141..248b017cee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3da304e7cf74d6c1973acc769b14365f96faf50eb5a5b01726255c34acb6ae9 -size 852519 +oid sha256:8a6b982192393dd77d483687665f8fc7963282bc56e16fb2aa99ad7442392349 +size 885325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 
4278202ae9..4cc65bec37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ab0065a56820aaebb25a0cf39735e9e6c20799f6e795112d01e7706ddc576c4 -size 829577 +oid sha256:e6ba1121ff75914523ce3c8b9eea6210eef1d74727e9cc8e372e0a5fb5183ca7 +size 855329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 871ea61386..fb16e5eb80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25d3a988c9c0fd39ed69cd1f506efe3a649549939071a586ae8db86e9aea6a21 -size 741033 +oid sha256:d6d4a51200e08982a70a4c4ddf7760d44c2de6facc04eccf74114c3ec952344c +size 758547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 53a4d4886c..43fae8407e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cfed792753352bee0912ce964654d72d400b5160181f0d24d0a3c3b6046c6b2 -size 748919 +oid sha256:e250f8599560663a0a70ec73e0cc02a9a94760c51397f399cf9476c9e4461265 +size 769983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 48d25ad50c..d02472f96c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7370cac806b5a065ebc47f319d032e1405eb20a5f66e47172a85bccda139fc61 -size 803429 +oid sha256:258924f29263fdffc3a051f7b62d406895966e8e742f53fdea31e893b815c983 +size 841121 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0b13afcd54..1129b8c0e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e86466f2acc0ef029299d6d1f5f25b1bc1e80ac15e96002bb8fdbf963eab3301 -size 782263 +oid sha256:a3feb6b1068debf95c653345eee226916e352a806fbed8b2e1080b6fe9984b04 +size 812011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index b5e42fa2ff..df640d7ad8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4021c7a4ecdae7e50acc9faf3337f46884a2eb47453ec078c85c714ba273fb0d -size 603498 +oid 
sha256:3de7f5a3d2341ad2402fdb91c0a3671bbfbb528c81b2ee357f3be327346dbf0e +size 623085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1fc5e0a225..d01aece8b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d6de40c3487fc1f8274e08082c5e27caad4fee6bf2bfce8a22820d80a408790 -size 632401 +oid sha256:f305a95d498c394ba953176afa1d7b6ed36ad930d38f923eee23e01eea3e7b4b +size 652529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b91a737d43..c6c1771c6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a91c6492949fbf45e7aaf91be2feead18642c145ca4c9383986cc4807b170dfd -size 836725 +oid sha256:687677af0e1b79f1b24e1fb7b996bb3eb4be9a9d183eeaa0274309817524e522 +size 871209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index bab032bd00..bad5311da7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f98ced6ed3acdceaa5d56f703ff28bb5fa17257e99d67514e02adb3e1b2960eb -size 813783 +oid sha256:01ba45f6ca1cd1bb2088e6299c903ad0290d5f93e0a8cffc1ebdc3523066fdd0 +size 841163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 03cc307f4e..db64c33832 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:0851ab4d4840f2c5c84a2521b3e08e8449151488c02d173cdcba493a635c6107 -size 795235 +oid sha256:853c6fef5b658398b2dc0f56a2c55f0ca7d95712e4571d98367605ae52bb9ad2 +size 832235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index fa5b4b612a..70fdf2a308 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48a36b0cb44731a1bb86e8b4d3f1a1da5aa6989cb3eb5d5098cb989410a70c13 -size 773081 +oid sha256:0b23b72bbf3aadd0d9d47518bffceee384300861957b8b500a1879373c49ffde +size 803125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bbfb7043af..4747d16886 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:187d6872aee7ac74f74a1c32d4ecd8390464ef49c5b7ac39b969d50a71a8f58c -size 850821 +oid sha256:646dbfd039ce135b23acc57572c6bfb8ce8279d3d4eba5edfd1d07cd7ecea7ce +size 880667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7145baca87..fc0334ff44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b5644028ce98dad306fd141b5634c511b208dd439fb2d815b3e3a21e34fd313 -size 750519 +oid sha256:4450749ed30cd6891a4b6cb1755297a676ee0018cef044131429556e6a8b5963 +size 784855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 0109badabc..fd210944ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a63e250b0c204779633a075ebfb76782390ae6822c650d5e7d9a87073c7c48e7 -size 832517 +oid sha256:2326a5f6ff64bb9c892558e6fedcc1cdea2f686bf9ee943424804edf2d5d7d6a +size 853089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index f23e7ebb9d..ab67003669 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06545d94b9cb469b462d6963d4331b25d757064779ddcf9008f9b2cf6dfc2de1 -size 726985 +oid sha256:b19cc87ae6750a5b869f937c3db4eb94e21cf68620f0c3784bf203f155776690 +size 755647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index d48dd84d28..ff7641c01b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:edcbfb9f8aa7f975969d46effa29cdeb1aaf87ed5067f3eb3dc3b0e2acf6fb29 -size 655815 +oid sha256:8e29cec4ca7863f1ade0715bf3272a6c5ae6f62f8dcdb22f5386175333c70e29 +size 670615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index c963ccd83c..865c4673b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:263b34bf83d6bb5d878a03ba407ea5617d4e52d776c694d882b5657b7d0f0dae -size 692363 +oid sha256:e36377429ead5bb1b810f31c79423dc33c607c13be383114b2fbc0ffe5e2d88e +size 712885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 3ef7988efe..56111eb742 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5df7314469c7689b60c3f3500e34bbfaa1b50bcba95feff2755ecf830f0d7a34 -size 565624 +oid sha256:29757520edcef850b5858d437fbea38946ac30d0fe39d5f7e4deff36e8fb06cb +size 584322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7057f8c2ce..571a4b0712 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:182b819a1d8be1be88c0fd59d9d2c50e3e6648d27852355420a3e29b3e81f6fc -size 596400 +oid sha256:17ad0bdb87ddb0d5a8e8fed1d16f8cc9c905c83999bb9e36720f78565427be4d +size 614210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 70c24b7492..e6a9ef0790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:68a0fcea0c7704568e194a055f533065f5d363abde4fd581bcc7f8318b86ad67 -size 798969 +oid sha256:2b1053e6d242262e7b9dd2571ec2d9fb54c0d8e7d6309b01528a5a26d8585d28 +size 835969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6d20dcafc4..0e47559ad8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aac841f072e2e8e1ac6b304b1a403ef5003716e11cf8c592edb00239dddd137 -size 706413 +oid sha256:9a650f5c97f903c96564b19dca7f3bc28916c80a506b508d33ac810d136eb8c6 +size 741291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index f9a69d7a63..4d178cb8bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0c7406ee017beac6416a31cd382e83b31a9442f41096257b0a92fde39228e9d7 -size 782243 +oid sha256:5ba050d1ea72a4a542a58812ef562bf3e430d84bc91ce7ef71259bdbe1be24ea +size 810067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 19544bd486..55d95e2ad0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09c6ede4e4f06455ac5446b5e8050f4af1dbc888b2d565ee9270d1cf209115f0 -size 684655 +oid sha256:47e9e9326a44566a8013b35a11a48323038df69d28a9d2416021da1ad5ea51d4 +size 713021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61632fadaf..f6f65471e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a3ce98d4b0b1a477122f3f746b11c0b1ce4b7b066972c5b43c332fb7b457e816 -size 761597 +oid sha256:acdb58ca951fa75431b12bec54d679eea6bce46458972c84d0fb0d1508901b80 +size 805651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f6cabc8d78..8f291b8545 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d165e1f254d4d77faed8390fa0ab2d21ee1032902d510b7044275b2cfa4da72c -size 822039 +oid sha256:3098aa3b6129ab0eaea5a7b683c72549fe08601e0e0d9221b968928961a522c0 +size 828353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b2c4f5fc6c..c9c817f3de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d31b6ac376c2011c760b42dfca3440663f42454b3f465740d2845a702ed8b027 -size 726173 +oid sha256:40373e75d9593cde6c5c7861284880877f469fadfb3688bc9cd838d3d3cde693 +size 774569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 6b4f95dfc7..cb14b82873 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a8b032e6540d6d2df9fbcd5c2144eb42ff088981652cfce78e7603270de4dcf -size 684703 +oid sha256:bbd9f1de63951cf63d7ae53ecdd817bdaf36b8f5fe279d032c4838e19d3bda7e +size 691905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8ffcad7f6e..7f26888570 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7207b4d8a9d0be9aa6586ee7c005e2cf35825db4fb36335f18a6c713a0ea4daf -size 745607 +oid sha256:3cd6519abc9e146b08bf0cf08e38df2aa2af1fc389dab6b4045eb39a29b5b4cc +size 792029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b9b29e14d0..00bc5f4e6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8fa02f582cc9f0732f8fe52bb04b7f86013e179bfb6ddbbb108b7e75cb4b7f4 -size 716943 +oid sha256:f04de7da084a4a36eea2f15f7a4d873f789364c446b9ae0a0366d20e13f627a8 +size 766967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cebf5851d4..1870c88335 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:660eca739cac725e36e0132471bf07189a9cf77fb632c05ed33d68194d021b49 -size 757087 +oid sha256:1f766d7be6d1f1641668a6f1880def37aeef3359e37179b6f373e75a6284f000 +size 800699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77f8879eb9..49860d2469 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad7d83df0232ab88e092e262286150bb6f597a5a0f3b0f5d2452f1510960e461 -size 665961 +oid sha256:48a9433c08de1b97f702c38833ac9429aa3e447f239d58fea0b91f48647a438b +size 709375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 063d891249..a555c1c56c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0848085724387c7ce490770d02176ef90ef8c0aa16e4180999aa8244c05004f4 -size 759957 +oid sha256:94c51392867e2eb49f9f7bbe0624cc349fa1812806d2f135333d7b7f7b7f0b51 +size 770663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index aa26c67f22..2e87bc55d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:428b127930c7998f0ee5e6cbd8641a6416faa94f0816f64b0627d466ccbf59aa -size 644657 +oid sha256:8819f4cac9bb8d212de0c7fefafc3e838ac85497a79b6affd3116109a6e6b06c +size 652551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 65c92fc334..cc07c558e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b20c547af94190a2dc0953404c6b9a817039148f751c9c97f529f6e1b2e24c5 -size 722453 +oid sha256:472b2bece97581369ef8eccf492019f1bf0f5ff3d82efe6694e578c6f5341525 +size 767939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 768458f8d1..42d1630066 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c57d8856d0f9bd7ede107c5690552a7e4cc3a9cc01554ccd7e1284b9ca54c687 -size 635077 +oid sha256:eda3a8416635fb5c534c8a5e0d401886a6bbe537a70a3a8bc4752d070831873b +size 681993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 79a09e6fed..737ef3e71e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ca11f3c92cc8759feb3b7014f1e3b7d3f2aa4f792eb6108a107e9ec29143a29 -size 994451 +oid sha256:99e1513f5bca8c6d4c8101f505d55a3c24949a29a5f26dd96a55ff3df79a6bfb +size 1043931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 0d2f98459b..56c263909b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:119cdda9e46ba401d5b13f8b40bb6c9c6e7e8a72c822255b11c4b803f9608929 -size 943191 +oid sha256:05187728acd2b180277e4a6f886b0c540fdd9956a08a9d2ed6a43101c459720e +size 973333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index aa1159e04f..1f0f417d52 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffe486eb2626e9063f4f29600e8e348d8da85a601ef89182b1aa1719cfe91651 -size 888483 +oid sha256:04375dc425a9ee56633ace11c38cc4aa8ae9a8211696bc981f2116ecb103b4a9 +size 929083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 531048d09c..54947d5455 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81c152b7258c38b244599944cad7ba72bf9c1ba777349d9a68f630c3a8a1148a -size 926221 +oid sha256:3d4623b234d703a77a44042daf09964769ca7b44d5f279976434c4d36dca31fc +size 988331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 410f6de8f4..198bf06c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4882d7e04adb148b48bb76b1db0d19b6190d8fd3ccca05f523b16d14eb4fd3c -size 876095 +oid sha256:f249dc035c6e83444d85bf83d2304a0d78dc9bd507928599cfa73ce8edd12706 +size 918671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9a31b9c2a6..b09d587464 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e8690735910874628898f2e00d38c74e4d1343340a577652d5b98b50c40c5af -size 743155 +oid sha256:5eaeede96a865981bb2658b73c60840f3db8e85617c2a58ffd31bee5e85a99f8 +size 785383 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f8605c2b41..118976fa2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bdb064699b0eae3982ec5b82316ad5e55b99e043293ab225fd27bdb7d41c24c -size 971307 +oid sha256:1a5488d80a04b36ed700eea952916a5d1e34654314337872df223f74828e13c4 +size 1021133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index fa07088110..d488aa958f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db8968783f4cf0ad5a656f803de655526d0f8c8d4b40b4acabbfd4d0ffc004ba -size 918469 +oid 
sha256:6d2a7fc79a635b71ee1e3a50722784627f8718265eb304d3f02dca4697e422e7 +size 950535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dcd8e302d0..c906668d38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a2699d1d9dde2818e93ac41cbd80ca05a148e851e4cd06bc25315a6667896fa -size 912697 +oid sha256:fd3665ab467109704fcb0ff0a6617cd87d3b849b6e56fc8745575ecb49100531 +size 973771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 4a81f4f5c1..46660b632a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b1d5fae7b3417da3d34aca18ad69bd0a0dee772c8b55cdbcbaaa7f2abc594ec 
-size 862277 +oid sha256:97a8f8d556034c130b8477c896a05e71ab785ca676125dd969bc3e4c46bb4050 +size 904901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 48f34982f2..6b59950090 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae898f9e639aff08b62d31625f39968b333986874c8468cd1510b68b49b94264 -size 980913 +oid sha256:4318501c05d87ec2667c946abc510ac69ca33df56609aa5e468818464666e729 +size 1041199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 861bc261a2..661973988a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6252f53dcc06d44aba2aad2f1177427a3626b750dd21cb38f257c41200dbcc68 -size 892303 +oid 
sha256:d728f59f99846951a99bbcad88f149383df3a7dbbd5485382c8fb8df29284e66 +size 935223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 17c1b055ad..f9b1d90585 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:482e692b5f4281730406e3c966902ccdc1346ff9c09e500f953498941bf2be53 -size 933501 +oid sha256:3c66f372ffae3489a57dab735e748a5569c888210e21e451100f171ffb3485d1 +size 970601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index caa350d8a1..8b79137035 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fb131ad5208b631d3ad45be5a8f491fc6ff9401269b1f5e0880f9eb2be2d7fd -size 840007 +oid 
sha256:68f0d8930924d5467362195d92a52d2aa0363ec471136b987e7bf859aa16761d +size 865413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index a3f15f8028..33355b04d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6029b687196aec76411a22e60cbce1446013e9a41ff563ec04ba14fff7b0bafe -size 798725 +oid sha256:59aefe753d6987b19373b97f09349f9d959abb0883ba9ebedde95c22da0bf5dc +size 842731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fdb5a6ee06..bf1a4c0c29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e63fb355cb83500ab87d4d36d9f1c76d8dfe51c3b10821d09f9c1bca6e2aa6d -size 707993 +oid 
sha256:7c559e7fd1352fb7dd3f7d5f4d8f76451cea8f9fecd0de35a8a0d90bf641c25c +size 746967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5f8bdb0edc..460d5f2089 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54a342a4d6d39ff4ba4e0d885b9439e8ddeb405c6a12503a4a72f49ef0b24209 -size 914705 +oid sha256:52ef5ba4eb9f75545a5a9b0c7c63d152e7c97b5bbdc897a72fd9bc4e69f6fb87 +size 981009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e074d3d2c7..7b5a05a1a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:709409278f50c77b8dcf6d3ef9c8da98ab5de980159d9037f8f15073be8acc71 -size 825897 +oid 
sha256:976068282c6641f4d2f2b5d10b3fbc293438c0c4597a9cbe2523418881fcc2ad +size 885345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index f592d42da2..ce581db3c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd4dc1764ba6e38d2a1c26e9d3b0aad77fdd3651904720723cdeed7f185b25b -size 868923 +oid sha256:d8009788148e17a29d674a17ea264a3d50bdcec2d73bb450fbc5b85cb24fbb8d +size 912139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 231185db41..64bff293b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50f045f2ef72c35463c41cb0346d1248954c01320681081fcd3914d0e9f2676e -size 775279 +oid 
sha256:a3e6dd45192db6eb0bdbbd05ec82a8031466925cdd962ecc0f16df223cb52493 +size 815683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f45b6c702a..4d5758a8be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:959b2d0199046d89c7c3ba4a9a5bd3a1e5e675429b1127f441c7591eb49dab58 -size 964259 +oid sha256:7377f091458799e7efba7f09496862a161128c232ee91d37f74cd2ea8e242883 +size 998003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 113836b995..338f166631 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b130bbe0863be3b80328ff1abf3b5d8dc49196d35b50b1f0b37417aaa7fd306c -size 926911 +oid sha256:ed3dbb09f6aeaeb1f92e64786dc9eb6624abac18e295c9fef894a17612de5e11 +size 948815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fc500aeee1..bec51f30e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39f2bed5104e9271aea857351b05a6813837f4de64a97689d38e067df9cdfca1 -size 858339 +oid sha256:3a78a348591c939e2484890c7e850865a37ba03acd733995e9a1b52576971b90 +size 883993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 76a8b9ec61..1c8eecd92b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:253c2be8f000d0e8574346cd01741cc0e7162e6e533443732bac25e32c6881a8 -size 896077 +oid sha256:9a96294fd86da696f77ddde526f865baa27b6e981a45b188dbceb2d36a8d9df9 +size 942401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f8dbdf80a7..78f3232d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c5386bd771f8fe61f5ca87c2aff60486899c202984780a5d5dadddeaaeab738 -size 859865 +oid sha256:27a20ca997786f2609dc74d1ddad17d3a527292bbe1e1b42ba5f4fe0fa05f080 +size 894941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 427afc6904..222c3d5b92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87f582ae66bc79db6bb38b140593490ce16efe6edb3bb733c0c50ff17def52ee -size 713011 +oid sha256:5aa6e80ecec5e76627175fd44a94cf94b6f9e9829f9e847a288f11f88fdd2e88 +size 740243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a5ca4b88b5..36d2e7b812 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8380e177741ef5a0e17db0b9de920e937ca1bed2c91168c245c4489c11dfff40 -size 940325 +oid sha256:6079ba1bfbbd93263caa109d1f3f8e81c6dc192d44017a8872b62d66c0c4f6e7 +size 976043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index e66fee87ec..a771690061 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acba93da0c86c95326c8bc63e3e4251e9a7067cc51a9d668ab3caf8626e32622 -size 902189 +oid sha256:611e3b9efd7b53893a61dff59720c105bf1dbf7be6f2677643a8cceeb3a707de +size 927595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8966fa7372..5e60171f4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13bbb34f189c2bfc9790ce79d0217d3adbaed027ff80c66e7dd7c1ac2ec87b10 -size 882259 +oid sha256:e2d52fd2957ecab74ab22ba376577c5bfefd5996706366be568f34f2b91345ae +size 928631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 
fdbc2689d9..7d58212a40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86ad6d7c4dec6ed569267f45af95ec05a20662aa3b741613a9c32ef455c6384c -size 846045 +oid sha256:73bc9d37b24c61c54b4d198baaff3fdbbc26484dfd7a84a40cbf87e84a276770 +size 880381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 36875092aa..7bfe3e0882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55c28199c08bbf6ecc0d9fe681b3af17dbc26660d699dd471a777a88c18fd539 -size 951511 +oid sha256:2f2e5bbd1717c0fdf6cc605f2960e73d9af513ab6f1faf8fcfdf025fb03cd0b2 +size 995023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f0adfc391..40f3fecea7 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e119fd99197633e76f8bdb1cadaef321a7e17cba0ee2a96d6ce0a3f23519111 -size 862159 +oid sha256:fdde6ecc01feb64640376de87bc60a2034da78fd8e9b3e5888e58e46b043b6de +size 890871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index e25f1c8540..b5f82e6111 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fc6ec77b5e5e2d9066f1b23d4759866acce2415c34fc2641164fd56c3403bed -size 919541 +oid sha256:1661f5ba3ca8d2c5c62541f470e89690f4ac1821d82f87ef848dbf5271e290ed +size 948253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 29c14e57d5..d93ef777b1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:657119b88227f64e9bfa9370e087f1cce1e87382f682d1287617e50eb49a9ceb -size 824517 +oid sha256:fa399451471924026349ce0b417ead82d04c77f7a7dd4ce1eaaab8f950ccabe6 +size 842523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index c89ad8cfb5..5d0cb1d223 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b40ad049f30704a593923a891035276fde9fcaab92a620f428c6ad9dbe565600 -size 769421 +oid sha256:2093ae1e418f24a29de397a16d04d3be4288e7f36855e84f244f2b0d52c8a53c +size 795765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8030f49bc5..3a0e967de0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2abea230dea132ffa657886c72b79b5a9af4e059b4df02e63bcd8d94afb8fae1 -size 677801 +oid sha256:f77355238eb1c6d1ccb35f2171b4364d653a3dd14451f3837d9888f60bc56ad9 +size 702665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b45821fa82..84e9661312 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6fb8c3a2c1b25196e83b0bced513f1c009e2ee18ebe5bb8c7981f16687d849f -size 884513 +oid sha256:2388c42587d05e4fd546eea6714b2e72b7ab0afbaeb85297fe38459beb187c33 +size 934833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index da3b5e7a00..839de50c3b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d96d682b46ef4f6862e28a39b54b4cb15ceda4af1bc50fdd938d3f6469e0973c -size 795705 +oid sha256:f0aecc2bddfa397dbb566a3fd9c2be18f295a20b8082cf406d434686b89f00dc +size 840253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 120b18bf0d..9bcb28ad53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f169bc221fbbc07cf3da0d3b02acb700e5e16b488301b4c6fb53c183a5e70f4 -size 854961 +oid sha256:e3b6f3a4c1afbe09f18a589d6acc5569351f84026f39ea5a65f8773e78ce4ac7 +size 889791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ab356af503..2a17596557 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6633e5d4e2fefb526e1066203e21aa9eac54a66da7f049e87c7097d19ebcb69d -size 759839 +oid sha256:1f6773dd38f44e57cab2c2c260162a887355d52df318292fae86d3acb7e81a9f +size 792793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e6809d20ac..914af9ad52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:881d1cc5825e0d022e8ffda2a480338a2e1dc96638e6f193d1c887235e0251a9 -size 840925 +oid sha256:6bac33fc573bc3614e8349b4c01821e80740b11574c3defc3967dfd60b8013b2 +size 903875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
842ae1adc0..8eb90bdaea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b501803f9841699c9df884aac325d5f36589f495ca7c3d10e209517312e23640 -size 798645 +oid sha256:41b6124bcf453a9331aa49166c1c048bb47e544e74065522040303a628c104d4 +size 864751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f17c16e3ee..a032609c26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac5dd3beead5efc24e8bf2684501699a62a31f4339c934ca03fce15a92a9dd31 -size 816597 +oid sha256:7ed7e75d5d098c2e444300231e34f3a261f7e45d8921d6782060522e891bc9f4 +size 880829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3b49898ae6..977aa8616e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c144adc5d0e41e4ba22078b7a830d5e09c515c714e8083f5e0981cfc99c021f1 -size 784923 +oid sha256:0693f0e879b550e3ae063ce16567b3a895c9ad6364ef7581f04804c8e6e5d2fb +size 852559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 94083a9b85..41aa207ee7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a6bf172fe586249d602412f9be6e76ade0085217039a8723d4461ab4ccfaa2f -size 831975 +oid sha256:72b870e8b46300a295a1b31c75880a2d69eb1b19fffb883759bc41ed12610c38 +size 880963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6fb61822fb..afc72f9e67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f5c388e42056ffaa3ded27dde97ffabc91a20f50c82a2d0471f7b779bf1c130 -size 737987 +oid sha256:084320cc522e4f6bc704deee23fbdee10122369ae036830ccef4bf4aa321af8a +size 803699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e1fbe6b07f..7ffc59de75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d67d76543be9670207463e2b47b017056aaefd8e2a911cac552705287b47ff45 -size 793839 +oid sha256:ada3f7321be01053d3573eb5660e2eec21b0c0bb3d874b26a890974ba21ce7b0 +size 845491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7ad9eec79..64a426fbf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dbd09f62d8103757ac8433ae329d1f9b1196d69b36fc870b9bb85927d995394 -size 703551 +oid sha256:df85ea5314d837f281008df1a209716e9071d26eec887f691f34e2cd6ac0846e +size 772519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e0a07171aa..d4d9433de9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66bba5b1b318f171dd51677fdc9b2435ff233c50998764bf267cb46e223fcdd5 -size 657311 +oid sha256:ebf7c02bd027dfd90a8195a6c93f14f87294ff1a9093c12f29a6e7c10e8fe733 +size 671717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3be1401154..0dff276ac5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be60d0e47ecfd7e9e7d3bfcbe7ed3083cc814d48d4f6bb245c3e86a9313af64d -size 603092 +oid sha256:c7f60a61f2c8b083a432ee0d3c38d733c28d57ee9c23f5784b02b077f83be4e3 +size 624109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 93d636b07f..165f5e6572 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:468681587d3de08ed897b87721c47abffed186d5d28230a3ae11615bec6b6c0d -size 647637 +oid sha256:e9d12fffee090eed5f7780f8eba9c77e8c6f745d84c27eaa98c5a8e93bf159d2 +size 663177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eacd7c7ea3..9c96343000 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9820c31f98a65a5c804541df660240289eb0e1338b23826bafdbfe466ae95f4 -size 598546 +oid sha256:2f2c6e967a56e6357d056c85452c128f0683d537210115a583dbd805ce904bf4 +size 618627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aa51ce727a..f1a1c68fd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b3bdce4712f04ec75540b2a5e0598c9f498aca1db9b4759647f1bdad7049fc3 -size 662127 +oid sha256:3a32ec042bb6cd4a3aaf7a2d7ef1d1fe783f7ace07cac5c15cc5e3cabf121422 +size 676679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8f7de231ca..8eaf647867 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b94aee962de0c3c14c009d31def46929bb4fef112bedaa376ad22787c9227892 -size 563598 +oid sha256:f8984e7f633551c10f846e53723ec4e979361caf4fd4dd4784281ba11e11e99b +size 573070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 351b6e0a50..8202ccbf6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6ee370258d1ad589665efd97e85705ea948e833390800c8f5bb652ddd70f5d1 -size 598286 +oid sha256:edb3917fd49408f018c53dc005853a157b709355f2dba9c4caf406b2b33292e1 +size 610816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4585a53973..a809852f29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71c6e70b8b0ede514d6422537e16946e1c5772d6ac8710bd0ecd2fda5f9531c7 -size 514312 +oid sha256:165c8df2dd29513c8ff45242e31ceb0f23fa67320d3a0b6899a9c4c460846295 +size 530296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eae342ef30..77f93e4d80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ac6f1b2e9d1c47c3d21f4dcaa470a9a2035019c1a7bb7030b53a9be303bf768 -size 650159 +oid sha256:d087f1cfb06602fc855fa57ebc3086f3a2b5124f40e423acfff1b1c97a990210 +size 662047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 59daa45c88..f6f9d68d81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01bdf13acc88655ed6860be5c24ac95efc0bb452559d50d6a8240efe8b31ba8d -size 595938 +oid sha256:60497bbb70ec3fbd3b941f2884564153ec912bdeed11d5409fd07314eacf5ca1 +size 615228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77165800f3..9bfa298b08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b340f3f0f0ea7a3fc175d80972ce6d75c1af0e8802583fcb1dbe793f7cb824bb -size 640483 +oid sha256:51a19e37508d4ad1a04685da3ab28266dd4296ff4a1836eacb16daf49e011225 +size 653507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9d5fb1cf22..3f07fe5d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80aba882038176a6e9c3ec1ae0b4006b291f2ecdbc75476f8c29ec954fc121e6 -size 591394 +oid sha256:5a0ddbccaf4281d766489d3a6752b4fc4ef658495ab57da5ad18a9550dee1d40 +size 608956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 58291a4be1..8feb0a8aaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1abd3fe351d9acf0e62d22c0e12a4309b1f2429a9f5c5ee86739b0a8d0eec386 -size 654973 +oid sha256:b241e07752e2e933de16d9b36ee6a751708ef2b536a566ac1dbca9b531db1e66 +size 666961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8e00087e7c..d3bc984b5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:b87c1a3fab3a27b383e5f0362d8063292fe9e16597fe93f84fd0e17aabde3d8a -size 556446 +oid sha256:c020bb8db2ee0dba6081a17221104fde06a079d14592d5e60ac9aeb9dff3c869 +size 563402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0d6d7da2e6..99638e8b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fbe200646629be8f4a74f86e363ff9517e63d1e29dfeb543726495371390cb6 -size 590244 +oid sha256:dda5f68a7a81ede47daa9e8952f3136e5d4fe8264f7a9caff5024603d2c8e0f6 +size 601888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 335482d10f..9bd385ceee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5add4fdb9317e0f939cd22098e2b20805f07396ccecd072e818867f629af665 -size 506370 +oid sha256:c7f8733e716cfb317b6846df96b54c34419512ba82e8125530e13591628b6e82 +size 520578 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c192602569..699f662c05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb586b71934da7d9407c36bd224aaa9d44e3c9f901341bf92415df37049daf6a -size 677473 +oid sha256:1553f15797916708caab1ad59c987c0645c7c747e292a04f4d7ef7be730da679 +size 690941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5292dcf78e..a4231b8124 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a4fab1c75bb373e53b754b95c0ecd391fa32c8e5e653b30ef83041a1c7d9e52 -size 622613 +oid sha256:374b47151103c7c7cc7f58ca58f816522377dc42ba427d8e938c679df50f837f +size 641705 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1fa6196a56..5316864dfd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a348da13baf26fe37f9b95d2c0bef88b27fde8072763dd74a0407d0f92903bbb -size 666219 +oid sha256:a22a2604ceadadf5fceb62d29d670791b4c9dd2cf0623d3bf276a274c39e170f +size 681563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index de2111ef8b..36509a4c3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f0640bc8e706b7764d2f47dcf92a95fccd97c93231beebf7c8c02d31850d8b5 -size 616588 +oid sha256:4e7492506d8923fa1aac9b672deebf69bb698b7c11882c13075fbbfbc6c11f65 +size 636273 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c33750e17c..294e49006d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7c951004933010e2901fda9ba20c3fc0ace98eb306c936b3701c77f9347fab -size 680217 +oid sha256:1fb7cd2b4b1ad1dec57e72da35719edd2d915ccdd3a9b3baa9a8f34dbbc62b93 +size 694917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ad30c160ab..816ad9c093 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fd6431cf05f01ecdc80f87a739a645a0d14a4e0b4bb0caf0fa2e6b09d3c58d1 -size 576016 +oid sha256:214097985f0def629a7c57cc850279e98e7258328237f507dd08e19578cf8fef +size 583564 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3660950c28..e258c4042b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce8662250b207ae409ee4209695d2d00e776e46fd800a3c5c32a04b2d8478375 -size 623087 +oid sha256:9833362a9c22a5fd3072d1e6c335b4ff9a1259bffc972cc0a53297ea1bf36638 +size 644843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf6bfadf0d..931e251b9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:becf993ff00beec16106980478549da259c52864a082fcd9b7c5846bd2e5a82e -size 525990 +oid sha256:1045a9fdd7dca5e37948000e9af2b394f1d9536524b9613150d9327d6d2bc5aa +size 540740 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d28d9f8450..5e8a096e13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3db845f6d7f21cb7a85dd328ca7cc76cbefcbef889a561f9cd11708e13ab2699 -size 670369 +oid sha256:102a626aabe400babb192a68781f19fe2bed742340643a1a0141f5eb551b0337 +size 681223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9277540e5b..5de6f7ed44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cbc9e263fb81737bd9fefaaf43d75cdff0b6fb12f4e0723b578f989ef3992f6 -size 615458 +oid sha256:da129c568a6bd212f4518002bf191cff794dae0dc9fd6867be1237616c964105 +size 632035 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 019c8cdc3f..e243b2683e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37a442c04f6673c36ae946d25d00beba4e1f381d0eec6bbf9199f371b64373f2 -size 659067 +oid sha256:bffb6bd9ee861ef90c2340008866c06e8b6df3e40799e47a8ce7fc7796da8150 +size 671893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f0cffb7b66..39d24bfb38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2dead896a43ebf5185beec420a31e105f7f01887b8f108a729d955466e6294f -size 609434 +oid sha256:f876cc94661af72a1407f12cca970c55f0bf184dffd802a35078606d58e08c31 +size 626553 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a7939d3c93..d1916380d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09c19a8b473727b4fad01d9d2742b8c6fbb4a954019eb8ff177dddd5cccd4aed -size 673063 +oid sha256:36891ea31c6d8a6206ee4d2c54958eaf31045e1866a3ec2f77db0982cec808b1 +size 685149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6901128e46..a9b0e152e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f68f29670b85b541afbf486afab27ae93411c6efc588a57276c587b40c10cdb4 -size 568912 +oid sha256:5f863f795c0b8c2197c9931acfa4cd15b757f1e704e425419f2134b3ad3290a9 +size 573844 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e0ee44f54b..135fefac96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f7f4bd9ab83e2242508b31fb76c6971b5dee28a424a2b16bae7fe088a95daa8 -size 615932 +oid sha256:115b6688f0ed7bc78b589a8cb47e990ae55a175b06f391a03cef677068a799f7 +size 635123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5cb785aa24..692c91be8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a64086dfc6033132da4016a39b7e7b04be33ca37e9eb2a97121f9eeb43e77d95 -size 519626 +oid sha256:71aa77dc50bee0e6ca6234f4c4ad4af33990bff6facf314bb56fc4937c719bdf +size 531070 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ffa1aa2baf..6fefad07dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dbaf12309e608a92a2ef0d0e7e9b169508c675fd7983559df17ac265c725fff -size 756521 +oid sha256:50edd8d003d0e4a05678b4722d94e75b2969b7db44a19cb1b6d13c5a127e610c +size 786911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 41322008f2..8b9d11f4f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf74bc864a570c01c5d30cd71ab4a461f15ad1b5673d040018eef78ffbf74883 -size 706495 +oid sha256:4b6e8b287e79e891b2f56ff1779ac67a78415ad41248a7825723af6eec53d0c9 +size 745173 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f147c98b11..bf93e1b1fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8f08b317a1d64e9fb5ae59de23b7ea577aba1d0d8a706d5cb14b2f55b50d05a -size 740333 +oid sha256:b06003227372167a5c64dc8836bc36e024373fa79ac54b71508d29f42cd6f800 +size 773437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 06f99e640f..b7b58bc039 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99b50959fcdb726b28d78f34440a9a7eb8ebc8b5c695dad82bb62b736cceecf7 -size 696819 +oid sha256:6ea40070cb65044284ca7c609837ecfd6ece606ac6ad870f893cf7d6300c6a4c +size 737027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9728691222..42f55a8a11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7203bbf53587318cbe5e515434e434938f9a73afc013903b6e4aa34bde4e244e -size 772139 +oid sha256:fc0d99cc82eed7b40895a44e795e2f7153d82eab1b4aab20357cec9146f3a5a1 +size 806179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a4664f7f2..e733f5301d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5836f7ffd41f5e4ccbaa82e8ca9d45a97a761348303bc6589a509ee84b2ac816 -size 673465 +oid sha256:60b8c8b3e99011557bbf8d565f36488c80f931d1eefa27eff5ed7c230fc6c7ed +size 694087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 385659f5b7..1efc8fdcfa 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a256d02ebca8c8fa614cc3ae9321e9ea6662f34aebfbf37652ea4a75f490b4f9 -size 698977 +oid sha256:154099dc9ecea5a2c3cd7a1d6c104e9a80c1a5e49895e9146e81514380284d1d +size 724729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11cc597a55..ede0087496 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3147f51b70533e228ee9e7ed43f6e7700b5581578bc4f60795c5c6852b99d3dc -size 615200 +oid sha256:c1a25f4191283b365235a1fab085d75d32fbda1bb488d36f2d64858fc36bcfb4 +size 643173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2171ca7edc..c33dbd284e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ded2ac04926f709177b60251c9152745c7e064e612c0dc40aea849e2d6df02d1 -size 743053 +oid sha256:60cc8e1c83bc0f20826824aefb0c1611624941e9d0307baddbbd2a58c605b9b7 +size 767571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e09ed74f2..98456e6aee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbe9376408325a2e1ac0c537234b2377fc55f6e508adac92e77042abc0aa71e4 -size 692189 +oid sha256:61027a0ea6d91076845fe9503bd97cd0489b4db9ecc0e3fc8c1358c1c0386226 +size 725785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index beb2813a18..906a7b441b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8945f593addae870f9d253c4b04be17efda54726ea335eb364e8ba369c5bef9 -size 726865 +oid sha256:5eb4e2ee0e84bbe1e069ccfb0bff3f93ec115221480e4f14594933822cb65f20 +size 754097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2caad3adae..610e3654bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e294b13561ddcdab97bdba3a75ff48264a44599af5b56be5b0c7a2ab55d1a322 -size 683303 +oid sha256:961d9d2e5163cb7f62e02c497a4e5d7ecc11551f40989217480b2b6e4492f28f +size 717639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 393b5f4d5e..23aab3e2f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:041d3bf2ff032052e54c89d4b679aaa8753994be6adb2793b71cc338c224acde -size 758671 +oid sha256:403b2dee1fb99380ef61d56e4b207a532484418d7fa12d8dfa1b574177034472 +size 785953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e419835b48..ba3902aee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e6f0ed25299969d60ec77475e461a9be5aef7f63c1dcf50a40361e1510dcbef -size 659997 +oid sha256:f001ab93ee1fdff815d8d3066020b467c62d692b33be4598510daf7bdc9bb9f9 +size 675537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 56e8111abf..866320b549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93225a851dbfcf7060321a3ca0eeab5ecdf064bc36dfea00610a8a2faacb1ce7 -size 684817 +oid sha256:0acf6009e96c23cca6d962bc76dfde8791e7e3288f54f1aafa535754adef8cb5 +size 704403 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f40de396e2..086ef06b39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46ee68b23d5785981ec359eb3cf058903d755481e5348f846ae25518f84b8281 -size 601682 +oid sha256:256a76023a19d0fa5deacc8263760aa998078a4672bcb418d0fd014ae57bd56c +size 623833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index af052aa8f7..97621ac6c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac4fbddf30d30036741c0f0f310ae088878865e896aa1c9717c77ddc941bcdb9 -size 776881 +oid sha256:229c2a01874b8db611031cdd4d394790ac19a5e7f76ed3a25c8c734fbe375c5e +size 806925 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b9ba2b77c5..8a1ea6fceb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94dc703791cebcd3a2d3fdfc3b7c09d8e572ded1ff6d0119aa94a22581ae9e42 -size 725423 +oid sha256:3b21f98a53066a5837ad200863ad1188552615c10923139ae79f4916b3a7e375 +size 763559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3cc5ab0e47..eedd43ca41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44f0f8885bdf0599ec8ab7c9b5b0b32335980c39d86d6a6e430b357769c8702a -size 760693 +oid sha256:7953ab5af189efd55cff779ed65829316ca0c4606d501ad75eeef43677db5a65 +size 791871 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d4ff713900..880f45e276 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eb511a5ca57ac7b0f29d293bfba44cfba48d1c569b33b30a09c9a008f494311 -size 715749 +oid sha256:9ec3dcfd7b16720256dceccd858b0c0a87a6ad276193098085db760df155af5e +size 754623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 46653f0044..8bc825db3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f639c6fea806d6f48ea1443f8b64b223b4fb1adb37f1b31828b4de526ebd1e50 -size 790229 +oid sha256:d36fc0ecfccc538a021d43158cd11e21bee50ad69b8a03491409f062615dbb03 +size 823629 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c43ace742..421193902b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7fe6becddb4a507cdd9408131902ce4db4c0840d25f239e98a139d7847dbf1b -size 681737 +oid sha256:a02c458201a8eeca746ac409d12ab43b2067e7f6d795af6acac3fc3bae88483f +size 702359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6e63cf7561..cd73d75d39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daf96d1082bf83824aec2ff1d3decba78f3e799c41a17bfddd5538b71448ebf4 -size 725107 +oid sha256:2e8f29f141719a0079f52ad7a1ba61b41cddfecb72bd0aeb4bd7739dc33877a8 +size 765265 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 94f4b4b749..a8b00a9eae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c9d2ecbc3193460ec9147de6ebe991a785d9ef92c40279efab0b45cb0b28700 -size 623375 +oid sha256:5864c3ac7dd1856c7275b18be12db10890748ed6babcdfc2e7ff4e0fa0017262 +size 651445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 60a87638f4..4757b71b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f29589e9ef997d80f0a91abd382cb625f509d98ffcdebdad11fe7b774c1d3df6 -size 763363 +oid sha256:0fae445db527aebf301bc4318b923b266f255eebdd683da9ea9b86d04458017a +size 786747 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c19de320e2..ff7d83fb1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad227dc8b804ed92a0d6dbaa9a8dacb3b956b3dede2f0f574c8a0ac67dee4ccd -size 711907 +oid sha256:9b1fa2dfdeb872035ea7e61ddd7a9a632c5818382cd7dcfe8b644927a01f522b +size 743381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9277f51500..c13a682e5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96848aa2274680150ca40b1be64cb648c61c51afcad2f1b22623ef2231cc241c -size 746387 +oid sha256:9de65236ff300eccb29fdaebef12c0fa7c528af9f49e8df61c0ba8348c263324 +size 772483 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8cfed822cd..75cf61d580 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a3baa7f521c235fb991efd27867c766005df8520cdef4b92b45761f33813062 -size 701441 +oid sha256:74bb2b5c8efc048f2a59434c7d5a15cb73fbac4c41ba46f805ac2682ec741f55 +size 735235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 98e2b293bd..88d720e93c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a4afa78255aaf85f7a835280b0e5d9ced259b27c7cf8a5249e9ca22a305274e -size 776713 +oid sha256:731286bae90637ddbbee12bdc86d51f8710a4482e8de900640bc16adc3339c97 +size 804141 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f62a00e71e..d8b2c3be77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fc9c19e7c798a3180bc537e5b453d433d95df8ac08ca065571d28cdfc706c3e -size 668269 +oid sha256:687a5577ce032a48415a0688c9b8cec402e6e1ea93d2df59d4f6afcca87f7655 +size 683809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d39bf8d00e..fc41830003 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f8e455b10a4e1ae218be0cc2b8e23ec4c120ff03566509a381fe9dc2b930621 -size 711591 +oid sha256:b13c56c8af7465df13f26ec404274f65003a2222ad2207385dda7f5940b0fbba +size 744989 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index da3d70c60a..c6f68e7e16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f95fddb265072c09027697ed4245fa85f37109ec18b40263bb5f0f56f4fbb75f -size 609856 +oid sha256:79951319ea6d9f938297c1d9b331a98eb3f3e4f19f66bdd7514740c899c5059e +size 632107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 923c378617..d0d3f4b6ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae7759a528b0a28b0f70f60fd8ceaf6478a55c7927142574a67c97747fa6b4b7 -size 635553 +oid sha256:e721f8f640c7de756d2018ea9030e892a54158f6b76996bd7908d9b0dc1a535d +size 649071 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d1a311e60..cb2bf752b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f0cf0f57afbb77f37a5b681cdb3e0b868b0d260382c240fc8a7f33557566e38 -size 602990 +oid sha256:af53530dc87895bf81a380972fb496494a2f23e0baa02faf9fa31e22457c3ea5 +size 623959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2fdb013201..f12f95f854 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dafd64e2d960da014ab928cb33c69e40406cad24ac414504eeb012f6df216e31 -size 629381 +oid sha256:f29eed027854e38c25d30d677522428de232223b7c38e748ede00fe0320ef2a4 +size 642799 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1929822f33..f1566f91a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fe3c04867b16a657fcc95cd70d2a51045d494a132c9a47347a58dee328e9af8 -size 597656 +oid sha256:8da4dd1e80520748c294831857bd997fecf12fef8f5d033a2f5ea3a5d8f19024 +size 617686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3bd7f68845..d6f8e9a353 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb5adfd40e8371e6c121c9db91b83f68869bc0e4508bda3ac5cf697a8e55ebfc -size 642589 +oid sha256:ce87fa92200f303a2bf74df73274a9e7370ed2dadd1648e8bb9c0c47535cb7d4 +size 655169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ece46020ef..9150c579d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b10764b80ee672754419cbbcd528c742d40a7e418adbb19b485705edde87952d -size 548846 +oid sha256:130dacdb40e2d0632715083386a7a6a628d1baac29ab4361677c12d66ffeecfa +size 557874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6d2a090065..1e2f169395 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7ca24d84a2dcee90546da0af926bd7748a7848df85fb73f5a9d59244dcbd0e3 -size 597446 +oid sha256:5972d633bdd07052f65d1568c400169ae1f2a088b2597f07eb088e4b2a3c8811 +size 608200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 182b771c92..78ff9f5d17 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1fd2da4d2f073b2eab95d39620aea19263698b51fab33a31cb889f471a5bc5d -size 513718 +oid sha256:3da8ef7209e75a3c922f68855f0ad2d7a73727522da8e364711b7e7cd4629a5a +size 528470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5dc0b2a4cf..ec0d360b97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9787aad80f6da3a05f12435de7fedd0666139eca26a064319240bf279b84ec72 -size 628401 +oid sha256:c3383225031a38d834300f0380599dc9868863acda8f2a7831316d926c2254af +size 639401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b9de4d504..371e479c70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f53c0d67a9b9c9c38559f0b29799e06e5cbfde104e49f5020546c9127c10e307 -size 595886 +oid sha256:94fc8e9edee296e70ea29873d572af272c7f62105cbeb975924e719566937ec3 +size 614238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0cdca3df71..246cb7c926 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0239b5566e04dcaefbbf14ca953ec253e0d02f27218cb744f707aed7dde6d135 -size 622227 +oid sha256:a49d772bee055b3365c118b3318a9463092ef40eee1541380306c48b72406d6f +size 633131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a39d6196d9..f41b1fe41b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3fefc4b915cad18712127a4396d740b34d9b90857a4803b6b5e1cd86435c9b5 -size 590552 +oid sha256:49405711a2c16558a77c8f14737c27b00e28f0a86f28afb85ca7763823579fb5 +size 607968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a03b4b4c9e..8527d6c66d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccbc73bd1dce98bf11edc8842a4a9168acd5b4aa09ac954fe1260859a3f942fa -size 635435 +oid sha256:66f2be359cbe881b0c28c439d0e7e1216aac2e5a2413cf3fc67717ab7cdf3107 +size 645449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4ed24849d8..688caafbca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4e985e08caa02b1d32ccf683758852ea7d3e93adce8126c1be000ad617314ea -size 541692 +oid sha256:1791f48e0f80bc6e6c5d3a2d82a8fba59bfcdfe1c395c3881938fedeeecce044 
+size 548204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 336e7eec42..87b29a07c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98a59b0ed017492b4dc6538a3b91ea969f0a8cf05ff39106e7eb7652b32e34e0 -size 590390 +oid sha256:0d253f8f238d3abaffcccf57fefef29d1649f772ea4645ed663ffc47515d5bce +size 598530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40b9cd4ea3..68419b1fd5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c58ad40b7f90ad9bb66574bb751502a302054fcb4b9bb7c33cf055c7938edaf -size 506566 +oid sha256:a5bd2d3153c908726aa37abe9f486a5e60a03048e72313d85583d2a944cf43e3 +size 518800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 527a952f5f..4dac52f9e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2d338de0c250262c6be4bc698f9e44c10a682689c2bb321c5d8c209eda56e32 -size 656505 +oid sha256:6cef29f205d11f725304269b3ef752768d5adbcc36324def8777d2edc32d0470 +size 668247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 58c6aa0236..4ae418fb0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd230bdad3906d96c77d9ed17642fe23434976125ea4680b433f60bade7e037f -size 623301 +oid sha256:2263b2d4c744274ad5541c78413be3dd51947cd217ddbb037d2e5b159ad5b61d +size 640765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a090c3ebcf..f29dc2964b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fcab9a5627b784d5ec54ba24a2bd8eda3736ea3f8181a67993ec40cf0316141 -size 648753 +oid sha256:d89246b89796c229bd9bd5dbeb975e76011bf1f7a0c3dda6f40bcb73d3fc8622 +size 661185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cbe5117636..143bf54e0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:385fc2973298eef1e1e838fc351324529633f614d5596ce59d692cc485b61ebb -size 615698 +oid sha256:f4b40c1ad86370be3e04af830088e29c0877e0660155caa076dd2aeb3dbe8c61 +size 635283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4980919cd8..a7247582ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec6f604dd4d6bcf016a6b07e604e8fe543f9479f6acf56777f08f834cdbef004 -size 659889 +oid sha256:9efe37b252c61591c40ca77dea513a79b077e748c7606ca79a3d32d998c7f44d +size 673407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ba1106cf02..daad8f3177 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c07712904fa3f9d301bc9ada10be07b8c3c006c8a682d00fa4cc9878d75afb0d -size 562150 +oid sha256:dafeb2d9c85451f8fcc53cf965dc892789bd6a3b4d54758648211bcf23fd2d8a +size 568366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2a61c4ce14..582787ca6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa5bee3542dfe634b663d39133f486823200812372496969e5e4c0197f29c240 -size 622245 +oid sha256:cbee6e7abca1d8083cba30f748c495695897e60359fa94ea3e202b877182e9ec +size 642275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f7092e7806..13710191fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e83992c6e43277a9af05ae8085b4d055b382aed948c32164cd9f7f1e2fb4ba19 -size 525444 +oid sha256:3f127357eca5fbdebd5ad1e595020160271eb4d122b73eabc517b22588245622 +size 538962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2aec5b2543..c9fc9b4ef3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:776fe324085ed8de9713a4fcde458c3b015d0aff5fa393a76607c7372ba34296 -size 649351 +oid sha256:717cdc5130bf13f48a77f2399d5eefea7738ac36afef44038b5ba01587da4bb5 +size 658577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4e3152475f..c21043166f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffabc175d45610fe392651713c2738ffb0d32c00a6d2aea34cb4a8c541e30ebf -size 616148 +oid sha256:40db2af8e9f054abf97c6a487c675d32be0601529b53b72ebed282f301bfcef0 +size 631837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 99dde0ae57..92479fafb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3da4e3ed524faacbb234229f73504209f858ed706739b727bb769a04cf30152 -size 641601 +oid sha256:2f2be2d4b5fc038d9718997b2d938c3d56677ebb3b568a679903d76f3fbdfdad +size 651517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3de2e33ffe..f4a8ec9b22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb91ccb7165c4544da0952781571489dc68bcf74147b01d3caca4e0540b254a7 -size 608544 +oid sha256:d148c49657ead706709921d2623c640e950f39a8ab4b9f709c1c6014010b6d8b +size 625565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b7728f2091..7e75b603e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2409c20275ebb571090b68fe639b63b1d5c2a17831ef90009e5516ff69f15b7 -size 651947 +oid sha256:cdc730d6a54fc3ef8c5ed5b64512a51a80632d22b66fe9b1d06b4f7d84e5ea59 +size 663687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3297f60fda..630c86b527 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9baad3aeb0e8d3791be84852b19c81b3b5f7b1ebebbc8ba3efcc7a74db35c92c -size 554998 +oid sha256:c957b53de9c4902f3959c1208e3bfa7155a3a00a6e585b475f1ffd98d0361bb8 +size 558698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f2eab9cd11..3e8fff08dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:715c1a61b762e914a73c66575d4aee0088e01d4b2942188bdd76b7d832116b63 -size 615140 +oid sha256:1c46f77f43e0ff2e41cf366b6530e3bfbda4c3698d6371ffa81d33159f4c556a +size 632557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3b09ac475e..cf8f46450f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7a4ecf2dc27e58d8e554d75284595a2fc319b79f8677472cd1cd81a27ecf60b -size 518292 +oid sha256:829c034966fcdad59f0c4b036a1908ff1d1d25bc3dc53050f79491829036373a +size 530082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2a859bb3de..7f6759e2c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d5f582650876e15997f73757f1c8dcc2db9e09853769941cea56d35e62eb581 -size 779959 +oid sha256:7f7bd4ccc8bf46846d09edd40f33de79f5afe290b35ec8b77afa02ac1dbdc04d +size 810595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b85762139f..ae5e4c74be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29fee2054864f7dabe663ffe8fbe776a473fe14f2d927ed4076079c0f863a686 -size 759873 +oid sha256:02977f87f7449d99d34528c65ee2d640a24c5d6dfb804e610513ddf5981c2ccf +size 793913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 641731e5fd..976dc611d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28e485fae85c97ca8248338910e0b0960a096fcd6ddfd1374472ef1c9069d58b -size 730407 +oid sha256:0d1830cfd7dd19a3386b8f141d14926fc5b917202830de124f94e7d3ac48c3fd +size 736871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e6dc8d481a..b0444b8c0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfcb0551c779f170f0863466379882cd563b618621a67ba91e4aa0c63ab9844e -size 648013 +oid sha256:0cce285387ae0a62c1de3810b39a5fb9fd169dd37a92c595b5cb550471861c32 +size 651911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b9f294b5f0..9a30166f19 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e10a0eb4506880903104563f541c0ee46fa9950923032e0615b0d7ac90c29e1c -size 772855 +oid sha256:f34761f34bdc99b7756182c423a14c3b3decb0b262c375a35135a834ad43ba2c +size 800925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cbb5275385..3bda33334e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f17d7fd9c908f58e921f8847ced17ad468b24e02e20dd5984273ec6d661a892b -size 752721 +oid sha256:27258ed83d9082c8a89f804c24624b16c527754a7323a7693bfba2334db596ea +size 784245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 1573d6082e..649aaa9f78 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffe0c4faf416c8f8d03756a55182ab737a136207ccf7e5cfb7b31b70ee94a739 -size 723255 +oid sha256:076f64da6409852bd99039663374237da4398356409cccce8e090ee64a168cec +size 727103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 50bb59490f..2b6a7d98cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a1a13135fd3de96433aca94077e8962f59251e516813d662835ae34c0f4aff5 -size 640859 +oid sha256:4d348a47ed3d4c47723f9ef75ad8d8a65fffc4dfb53d19cd41a51190bcfcbdfb +size 642241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db3f64b851..9f0fbaa13a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd46c840787cecb18e09d972a21ab9f17c1bf59933e48715a942528cabf8b771 -size 647691 +oid sha256:a5d4c5e6472d6b49d616a92f68c5d4d611105c29e24a45b8068adc022f144654 +size 662195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4e347e8c3a..596c605659 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bde1f54b99ec21670da3b0d0d2b8d0fe90e478a4d338bb9441b418e87eadb74 -size 592188 +oid sha256:4e9ce867c61ccf9c2e401509417ac5c336c7dfe2627a18d6a1070ddce4dc6808 +size 615376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index afdc22b57e..9c145fa29c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac167833a3adc3421a16558dd7671ac57b7f600fec15ed133bea90e7801c8a9a -size 636881 +oid sha256:8cd96b40a4243dfc1e9a5ba648cf522ed9367107a9fa84808bf18ec438b18e1a +size 652669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f9b5f8dac5..930d4accc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d79ca6fca8528b59982bf4aeed6b1c0b7d574f24bd324492d1896850eaac75b0 -size 588038 +oid sha256:fc2107590974dc68742ce3e51d73f8eea8e53f91bbd96167947e734fba2632d3 +size 609104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 24bab5ef84..5f0b3e4c78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:9744b5cacb4307a5519b121b9abe510bf716fb698bde372c81c0aaa3fe795329 -size 633119 +oid sha256:db5e1688879cf9c3dab10f226b1678ce080ab5f990e12741c0134c7097f57d79 +size 646191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4404793be..d133673eea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06d4560fb2373272a3f89421d65057c58a96727dfeed18d9282b0427858f05b6 -size 539574 +oid sha256:9890b141af97a869534ff2682d6580f50071507e6bce15a759e264e8413c07c6 +size 548058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d3ec4907f3..9518d60d6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ef0b351a7eb4e04cf37f8194cea1acb03bc9652387653e5a12587a64c261d34 -size 587532 +oid sha256:cad9388e000d17e16fb4f34a99e4630d176ef3887deabb973e51cbb136cdaafd +size 599272 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12bab64080..2d6657ed38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:336080bc2b9d01e08579f338e7e51b376ed5fa2109f48d9a37197b6f4d39fdca -size 504002 +oid sha256:aee1fa2af26df3cb3bc99c5db574068725b1136b058d89a2b3e1f81ebfe1cae6 +size 519592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6dcdb7ef18..d0a21f0404 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8979dcf44b38e33ee5bd415dfb064b920eb35985f9a24d58255a2d79a6cc48df -size 640539 +oid sha256:3f1f92a916a99407f9673a542ca700dbdfbe81a8418def7c9503575a2c3f98a5 +size 652527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ca539c2378..2295384fb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:941896d14b2fd86b5dfef8f8b5f21e893ca0efd806ad9061877b8ab7baadd057 -size 585036 +oid sha256:006fd50b1779046f482c6035995a6f3fd05d1bf6ecb908f00135d00eeb2fec61 +size 605706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3288d46dce..f7c090f7b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:163d12b584bb7f18609870981fd0715c9f9e0c5b05f4cf41bc48de48e71b9e6d -size 629729 +oid sha256:903bb361b2b6e2f41171834ffad414e39db16d12abe4b0756bdab5bc9b4753a0 +size 643789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0f2c20ec92..4f66811752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:777bb512ff15898bbd1d638e90698e6317a9307d9b305b662a62b84db5d5366e -size 580886 +oid sha256:fd5561fc8cc9414124b4ae250656aa1197b2f80825d52a4ff71020353f070880 +size 600224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1342725c95..289737bb15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c2ab207ba5d708d71c85ae4348e1798be0e26789ce30f2f8a265e5d8a62c8a5 -size 625965 +oid sha256:6e46c5c5808493a932f48b678462f9667f5d9d66c2059d22760719cb3142dc45 +size 636473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c313420bf4..776b3f33f5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f0cec0571a5204b17fb46c14eb44ff20217bc56b699b676fe15c24a2455e73e -size 532470 +oid sha256:ba08f987b01ce197f1648a00da3bd641fcc4f8b9e62c845be0c1fe42a56848ef +size 538390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b141025281..95a36c8d31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a01f81e28bc9d5c131bbce3cbd618914a4e271094690ac9a609e929086200d33 -size 580328 +oid sha256:9b074791d9333bcaa531865a9ecae11949d16d6a992879f1db40fbc71cc0ea1b +size 590344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b4dd285ac1..d021f03c20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bec49b63ed553d3e9e45fece3ce773d6f7c5c5393f323206e28461b748db5b89 -size 496848 +oid sha256:cbe4d25ae7907d4e9bef8c1457afebe11269224664aded156042d2f2a99925fe +size 509872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 822f7b98f3..46905c1847 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f25cc062de26bc2d00d8ff2ddbf706cd21c7009f43e812cf59ba50519f7334f -size 667853 +oid sha256:053cb38d6f56106f9edf1546f241a997ce08677f8aa7fe65dde7453d768892fd +size 681371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 00b592958e..6d1f52569e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5c0401c9948540523ed61a4eb3fc218a3945906837c1ec82749996d9dddfa2f -size 600412 +oid sha256:64759758f8a359300adc12c5a6936c10bb3c74876a31f59ecf10b032258ce59c +size 632183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 995e50bdd5..fcc379f28a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09bddfc7407cde32b5d2874064e29266ce096b63bae7bd23a2e23cfb72c5de35 -size 655515 +oid sha256:8f9dc043949fe49d1cd9fd230d4a1989d8aa2adcd11c855f54b201a49169ae9d +size 671055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2140c9b9e4..9595add972 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2937164539df66da64d3f5f3ea2057374b0d7446d58351b489ef44d88b449017 -size 606228 +oid sha256:d6ddb0f6d908236091c095f9c87a7ca41376a3ed872b2f748f72cec8a3190b93 +size 626701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9985cfc8bb..de5c21ed68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e0bd9fdf51f58b7bbf2d473403dc2bceb0d4fc759eb9e999419b5bd77b1a8cc -size 650419 +oid sha256:5171574c7c86046acb14ed9feb981f8a5c3540bde12be46f56c0ac63bd01f2ff +size 664429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4068f014ad..5648974f68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cac1bdf6203634575214b116526eb95202d3d3b6aae73fd0f6b9282242b4a493 -size 552828 +oid sha256:4d92248bf460003e59f8c6398f1491b4deb816fd1000abc93c9fbb652bca9f38 +size 559242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 170ae86d4b..5fd3d2fce7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b5628708d366294266e0d3671e43dde8101006303ba34a4bf23b7c2ff3c50ae -size 610802 +oid sha256:8f5c60e0d92d07fcf352e7274181ee9941ea0aae9962fe0afefa0cd500e23398 +size 632213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 15f7ee3c47..819ac24a2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7de1845049533cf30dfb96c22c9efa2b650e93a818cdc83deb5001d94acd45cd -size 516468 +oid sha256:4ac557fafb8e5404f471d394c1d8a191b839e7ef59b3a4e78b1746f2007879ab +size 530034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index da30074c2b..34e8683836 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17311478ebc24f35a52800cc4249bfb892c30c28dc910df62b70a5a6bbabb9c0 -size 660701 +oid sha256:d278d999a15a227c7c2ccffb7e745b5428581416fea089f74ff23cfb10f83d7a +size 671701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f35711a2f1..c587c7ed82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8f7244a24eded74f3bb75c5e278681c6e926ab8a866116d9257c64289018d5c -size 593258 +oid sha256:d9b9572bc0654d7e1b07f32ef3fd9d366cf8bdc32c7aaa9250d7f9961a0c8606 +size 622515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab097792c2..16585be087 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27ae90f02d6047148fdda0f85d805ebad4553bfbb5727a948f5a4658ebda2f6a -size 648361 +oid sha256:469579661cd1a88b3dd6a31389c51e1402e887cf842563e1ade3e2c18b59cab0 +size 661385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d8fe3b9cd1..9a29782393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00041d26f664ca34b5957341f92d5e7f0a0cbbfcff7715c2818bb2b3cd72f26c -size 599074 +oid sha256:f96ee33f8ef953b86a32643bcbea57dab5ca395b0abea91c9a785b28b7847f3a +size 617032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index db07a46684..0da781016b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a69e72ac6fcec06562d15d362b37c61ae482f3fe24fdcf7be16c31f699d8a141 -size 643265 +oid sha256:b76f0fd70688cc24937146866130dbdcfe7fbb6adf1ec48337b02b1d5e7eb19a +size 654661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3811ddfa3..ecdcab62a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73aba360bc50466a45e7129f75064fca2361cae29aaa708706af4702349f68e2 -size 545676 +oid sha256:5b4f3a26a6d0466ba6f148c6995c523405bb4041522fa51a475c97b33ac0cd8e +size 550362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b4a723b4d0..7ea2eebcca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76ed01cf78ce7b03373cbe4a7521b61f16ac161bb84a4b838af03994fd06ac58 -size 603648 +oid sha256:93896092f9d0cc18e9192483aa58900c4ecd39389e316bcd7c8e6aebe75006f4 +size 622445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e484c19150..5b9a563ac1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b6b7854e060dc07d765dd9e03aa8cda15703bbb0ca782387866b1256e00f0f0 -size 509314 +oid sha256:866c51c1aab8c01badc2212f0eb8e45f6a5123c431be24edc57213e3ffb5ef08 +size 521154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ac2e6e784..4c1dc30ea9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0beb6fc881a2fd5149c7a12524e7691afbe08b7594b9df42ccb4d4a353ec3187 -size 746211 +oid sha256:c52f1b941b345e1d08b58ef0e0fc6ed5181f300165c7544e94e72e2b520efd8f +size 778179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6e2aa456d9..4c9893994e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dffa6066833856c2c0c0b19e09c2dc59781cb074dc2d38b831f5b31bfd8aed31 -size 696135 +oid sha256:3b7860755f579d251a734ea47503b36e789b024ce19f33219dcb35e6208cfeb3 +size 736391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a0916a03c..6bc2e4fccc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b705369842f01410f441681c2ec603fcec5303647ea6fe1b2550ba12f62c857f -size 730417 +oid sha256:7d5d7e94447a9229365c2ed7daf40f70be03945e5fc913e8534cbde249f35a70 +size 763717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 272e9931b6..6b73d7fe8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a99ee965f3cfbd44f177723ac2785017e340cd6ccbcac0c6ed9fd17b6e0a01b -size 686065 +oid sha256:8f5c65353cc930ceff60c503603e8f2e1dce4342bb3666ffa6ab56379a10ea76 +size 727259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a5a69abd3a..edd84c7cf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0241be1cc86b081ce2956e6ffedd48f52190de98ea8962945f966c7d168d671c -size 729515 +oid sha256:b7b12f5abb7a1417a52da41c4d98d5f0631b4ac3546bdf164653545a87103de0 +size 761139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0f57266273..d8411cbce5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:792a28fe37399884396ed750dc7bc822cacc140503785903c8138e0a3a782754 -size 636515 +oid sha256:428a6cb6744d0f66637846c8a24c5a5f1aecebb9086db7ff2f17646283fc24fa +size 656051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0922f979b9..ff52a615f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:399c1e157ccd9a6271294ae72f5246cad058ac862c52ad09bb981b317267666a -size 680033 +oid sha256:e9722d482409957c083ec39dfab3a624ed5b00d29dd0312f28d5051d68212b68 +size 705981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ec898b597..5794f29eaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13dd9b797e9b9e215ad58b237850001b101fc2c94ca54cbfeab98991d54d8c1a -size 597044 +oid sha256:9cf3bd69bb1fe8c8c5b431d194bbce6a50cb04a1cd752dc9c0722ba4c2127c0f +size 624377 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0468193abc..2050b5d27f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c7927563d9cebaf47e9cc279095b69bc1db71fef6fa861e60afefa578954511 -size 732693 +oid sha256:36bbfc8061349fa29f02eecfb547b98d3ccf121cab8b6767ca4caaa561786e05 +size 758001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a032a0aa32..2fa3dc3f89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a0ee92c254a4518dbf1eb875cf7fbf5c39d64811b86f3262159b33a6744f0de -size 681877 +oid sha256:85dcf6550af219264c09cbd512bfb50ed0bd1df5b5ffd5fd2f05b1656d571be4 +size 716263 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index de71c8ce23..4c6187a9cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86baf1d9bb0eacced514c1da21582272b23431a0d50181a8a0da130b4886d0c3 -size 716901 +oid sha256:a29da1eadd3e4060b47dce6bd790b34e32a2f8f0f6e6776d1d03f22ffc480ea8 +size 744379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f28af2eb82..de3ef9dc5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6bc9ba62b8cc16d51d2fcc0ab6c9d38e9b487a2f49b3b3918e45460088983b2 -size 672547 +oid sha256:4a8d33282ba09678773a3f49b141f98d2c1dbc7e7c168d61160a9e4b5f527c6d +size 707919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 48ac88d4e2..fb54515ab2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffcd73be21ddf5bbb78396109c62d99dd436328450041c07409e262692967396 -size 715999 +oid sha256:b3d57cc34109148c1258572f872dfe07347d892dea4542238d2c517e71d1418c +size 741701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c1081c1b21..914aa3fb2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79e038f2a3df4799e171c8485661b997e3e4a8696da62b6ea4bc99696916a3d1 -size 622997 +oid sha256:03bc99c398ae7d36d1770cc3b96bc66212f43f51b9c5a28975e17c5e7d125b92 +size 636663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ebf3486ca5..982bb25f6c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476f26e27901f7bb8d294a3960c334611f64516d5c9f4953e3fa9fe1c1378ef8 -size 666663 +oid sha256:fab0558f0d9d8a737b6817cbcbb69569e0342a5b35b04f1c9d6401986008ae4c +size 685755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 671b4e21b2..d72cd1be85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4f0ad2d516ac4a31ad4d993adb7d420a4c346fb61bc1aae60665e0b0ba5e1f3 -size 583576 +oid sha256:63370f13d6d303919f24c89782a31754463aaa329857fb3e9ea060ce62a69195 +size 605776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 90d829fc46..9f57cc00b1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb2375beabb0a6c1af2fd0ab6a4eb98e4adf8c44552383443a2ddfc6acab3364 -size 767359 +oid sha256:bbaeca1ebf9b52187fa0c336162698af2cc96964a3b1f2b63f0e4f7026333fa2 +size 797353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b0ec89d3c4..cfe94f0368 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31227408e0ec22b1b97e3e8bdea0242c611515547ced30533891f429ef9f5a0f -size 715113 +oid sha256:14dfd023472188bb1f7012364ed25c0b16249fc9bf475e916727d1f69720f149 +size 754037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 26af567770..5a5dfdd1af 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed0e8d2ea32cfe84e271ae6200431cbfc698ca26c443d3c53dba8a594f835f06 -size 749987 +oid sha256:2559f7a5296df9157d1cc0142b1e7bdfd67cd6a2636c0bdf07170b8b93630632 +size 782893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 51e27570e8..76e3d8708b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0092c609e6e525a72639b3ac8969dbdf9823a1014e7f6a1bfcf6b3db57485337 -size 704205 +oid sha256:ddfc333ce1d98e511e7bafb6c8bb33915ba8c8d7de985e0dee37021fc53e5880 +size 744855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6f30b3e6db..976519497c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0649dab7d85eceb65edb1ae76f3540a776dad2891d5c246aa6bd70c8312d4809 -size 746767 +oid sha256:6515fc42d1571969b956d4f4a41c1242a2f1b7ba66717543a8dbe67e6e7839e7 +size 779969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4807daa6cc..e4d6bd5664 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:179a6bda5f00d0d1b8403de1a5ac38813d4871207da6ce5ee92925a5e221b8a5 -size 645577 +oid sha256:aeb37461b6d8875bde07be9a0b48afa3c62ba6be53c644055177bb928f5c147d +size 665113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8cb3da25d7..40d913d425 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:722a07d174eb689e5c961bdd6c694cfaa776a75b31b13408fb65a9f617536464 -size 703697 +oid sha256:67a831863d6d76690c7d1c6d81c0a114b3785389f41cfa94bdb26aa6a249028d +size 742227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c15828d006..a622a87ebc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41ecf995d437a23759f2a17b9185e1a26174be7674255b95e1d7fbcf2334a686 -size 605318 +oid sha256:8a01ac272bb15d120af500644f4e6d2013201ec247074376b0ef847cb2a6b134 +size 632599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5dcf12791f..89f9451954 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3738cbbcc5854e68a6f950ac11a1650a92ed73ebe017eb7dc48a4002451c6515 -size 753053 +oid sha256:401b8b126c3c8f438fdfe19ac40eda43ef04ed8a788a3e5ccbb11861cdd906e2 +size 777225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bb1d8c91bc..8d1f0455fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4df151c12e77c9e3daf1f89ab5ab15706eb9e4b96e15d06aa127325461725a38 -size 701595 +oid sha256:2dadccf08bd498833f0a259e59049edcee0b5b21000ab49943cce83ee029d7da +size 733859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 28eecdbf37..ef02de7d95 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee83f142f0940711ba7467c64897a3be0f455e5229d6ddfe9437f3aa519932f3 -size 736471 +oid sha256:6a77cba76b907a8b82894f12733ae7e4aa75caaeec445d38fd31daec8c98bb78 +size 762765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7baf0891c6..e737110129 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8a1e74e84a4176e2676608d17edfd9b448fa8d3258d31f31965c8dc82b6da1e -size 690737 +oid sha256:d101629a6a3c96704c95ecfb3b5137c99750f22c380f05d1490ca383143b4d5f +size 724727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aa3709f3f7..c779d21444 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7152b725433c96a5858856c475440c06a1c978465037cf09ed69ec15df50f8d -size 733299 +oid sha256:0ee3357db3013774687587855c0c84c0e9fcf8f7f7275072dd11850608c5d5d3 +size 759741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 849da7b4d8..205fc149af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c1df12a1415b8c10a9dc71cac7b42ee5e6e193865f1446d12c0ef0e0b349033 -size 632059 +oid sha256:972d12c88b0a733dc5f965a228610018fe9eec628853c4bdbe62026b3d5238aa +size 645725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2ce26928e3..1b745b14d6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8949da078594c856a93d5a73254fbcba7724d821af487e26afd44e6cf220685 -size 690229 +oid sha256:2248f050185d343462573edb7f22976fc36b506057910fd2df8c3e68416db986 +size 722739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 097a6fb3bf..ca5c7e5cd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1a80f0c7331224a7070474c73dcc3d011a9f243081d70d7b30ccf608e06cb7f -size 591850 +oid sha256:856f865699610341b9b1239cf1901e58c639518907761f3dc503f09a778cc398 +size 614050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a2a05d2dad..448dc06b5b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ea96d0a805945e8305b8304e135773915d5666851699bbeca14ebee19033e5c -size 704377 +oid sha256:a86b87cd694dda5473c8cd6a3f9b7ba19f67334a7bc37310b67910f5a36734f2 +size 740885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 95014c6e01..322446c7ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4fc8066b5d92ae05ba3c9217916f9ab923304c1700a6b73fddbd8ae9f8b8afc -size 691791 +oid sha256:11512f53a0240fe319a170dab8448d26f745a4f5c15bc4af7ee9eac2553735ae +size 720553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d85c82d0cd..45f9b7cef4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a588d44a8325e532ecc503ae52b278bd0cc4ee9e2a92efb6ef530356495a1d27 -size 680381 +oid sha256:f2df21ed90fbaa773a2d400a5b3e0d0c78765998c8984c36aca112c9f06e00c3 +size 686105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d85a8f92ad..f4e8940c03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d77df5d4bfba357f417c1694046a263ed79bd6fe201583dd028c4e856dc113c6 -size 589204 +oid sha256:48558dcf86d84cccc728011767ebc43a2f9210c245d35fab5c93e7bedca8e20d +size 601932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6241d55d6f..9771e89ff0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9479b4ac9c2765885fad879a63d1375baa306c7c2a28889fa96bde22adb6c01 -size 697225 +oid sha256:922d9d511696849dc84b4c82961282b2969d490cab5c505e3685cfa7429fd985 +size 731165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 786b816c8e..6819e8503e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5026a3cb40e17c13506e40295511bf3b09323fad3b143e89ab855a5698e604d -size 684639 +oid sha256:d3a72036d61d88fe7704e7c2269582fc77e5e11ddde00131fa9a0723d4c0ac56 +size 711673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 33abcea5e7..a9e6f7154c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a621faedec5a7f2b102073e11acb99d1748c0c12c86c71f145dd890708353031 -size 674017 +oid sha256:9daca004484c86a401811bed8722f0377e34e77ecb0c80749bc41e7a274b3edd +size 676385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d8419b8650..6e71667f57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c02e6c3463480c7154f26d349fbe3c0eece29493fc2212952eebd46e7c6288dc -size 582052 +oid sha256:68d7396e5fd22d1fc9224068425d8d8a3773914f6ee2136867e36c59878ae2cc +size 593052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 57e417c10d..374edc1e36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eed3965e632c5b01f391f713c7872125ddaf8b11ebf6e4bf2b9f7c7f3d2bcd4e -size 625391 +oid 
sha256:7e9fd3fd471ff5b4f2456ba9e3b1feb5c7c7bfdb34e7fb0612e4cd013a555c68 +size 640339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e4c2261822..2544561b06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42b83878af54454754242abb30a8b2f56e1d3f8474f575425abdc417a2f9e41b -size 592088 +oid sha256:615fce75e9782edffc0a9c6a475a9710793af3078121db3c1cd1311bce79abcc +size 614386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 85ea736985..8dac3a35d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4edf977d3a4b6dfa2d64d1408c5f9dfbfc1aecdef212d5a6e91035c2fe1023d -size 619021 +oid sha256:3fda4575571b192af9de4a18ec3bf1f3d4ad4a401fd94a53e2384102bdbb90c9 +size 633229 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b04b9c058d..6ca42c0376 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b8cab1adbbd5c20b62fb81136abe2289d9f49240680c3a02680d89b88f57cd2 -size 587346 +oid sha256:56743bc582a132e9ed8b3f34858cc1e15a0dc1ba2fd96e0039dc10b172367497 +size 608116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f2b4b65f9a..5e0077658d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1503e65a608938a28df30c723daa6ddedff81be5334d70872f85e3d80c34a6f -size 623397 +oid sha256:08a69adfedf539fceffadf8c8cf42fd4dcc58d168737a5e185f155f46bad9fca +size 626901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 32b14ec61f..1f3b7e6476 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06dd7ab370a98435e8302a374242dd85989061d78cb36a6c7d3c94abcbc0a819 -size 539916 +oid sha256:030ffb0aed84a56ed1338be8c8cca0d708b306f7866464203d8f679b357dcc04 +size 545984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d478ddbbf5..0e4be915eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fdaaa427568b4299a408637a08fd6dea0e09eeff3461939e9ff3a1322b3d566 -size 587826 +oid sha256:f9e0fa0cfe0a19666e5945024c4bb3107c2b0092d9445067a8e3f4a274f86088 +size 598334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c9b17fd9cb..a39b2f4fb8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16d181fb143937562f3c54f0a724afb182399ae62a0cec4ddb189bfb69634b21 -size 504790 +oid sha256:07865be4d397cbe048e8d98b23ebaa2be9f3e45fa8a25fce95fcc917f8ccc16d +size 518750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06758068fc..81fc92322f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfafc0feaae806aa06cc268d0d7f2e6c924d745db4538768e7bacc873f87cea6 -size 618237 +oid sha256:703993a4d180d9ccc5e6eb63441d53953dcee3523243c3c2b0d03387b7bbb19b +size 630621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7da72f5c12..fefe3a3473 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09f8738bd8298857424dec4ba69b276728550ea30d65e891eadf405854b2549d -size 584934 +oid sha256:a1076b51571fbcbfad56558c0d19c91c8d92878f0df9a989c09607d206c298c3 +size 604718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b9551bae1..06e0f54442 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee582858760ced4c06d25006bd320edaae86b6221744a9c38bf7a68621a35205 -size 611866 +oid sha256:41f5c16a432c2f396ab1442b782cedfdb9724215f52a0995e8e5159495aa1655 +size 623559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e341f07148..5f66bf6fc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22ca1155f4c7ba7e6af489bda3e0702da660e2201ea2cdd1c4fad5e7c0a97ddb -size 580192 +oid sha256:516c7fc9482b1655f53160c4de89e609abaea00b47b934f893415fdc834d9776 +size 598446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 219dd396b2..a373ff97ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70d9477ee26b35d57c176c646bf145a792cadb8e520e46839dea9c6cd1454781 -size 616342 +oid sha256:61a38cd5868701887884898433e3d1f59c683abd7ba5187beeb62f304cc92e1d +size 617082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 78b9646785..e196f38678 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7727e805b5aa9ccbef2e826ecdc0c290c229ab3981a81082fd20ab00e499e4fc -size 531974 +oid sha256:3cfdf9c2d645db76d16394df8df583dfed9ed28ff32247f1d42b34a43b78ad60 
+size 537104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 01dd9cecdf..4ebfe6afff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c9122f4122f294db2fb2c34dcc8b5e18b5c2f3653383fe539135687fb2daef0 -size 580770 +oid sha256:5b62006b9a0bcb8f2ab5b8b3d01d780c6fd42323b2f17936072aa67fb5d79629 +size 588714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d67992d77c..58b5a12c8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c1be1fbfa944187fbd153671d1e701c1e055d4308e417175ee3bab4c87b082a -size 496846 +oid sha256:dea4b22319bb25a0e826a113428115caaa1d07c21d48eb3bd87a4d29d5c6c679 +size 509082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a6cf17787b..331e786f22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39d952d9f5aea1e16a84605e8491ea78f055d094d6e33fe5c3edd208178fc0ef -size 646391 +oid sha256:22185fc2cd6802c01f6adb70f4b5bc556aff7d073c5fd580c7e1a4b3643ddbfa +size 658725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 104040dc4d..e7a0a413c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:002d1ee299798428ace3653b7195036b30f06c49d3e19b1de3231aa900fe8497 -size 601200 +oid sha256:c743832f1f1fba6fc5ebf1dbe83192d35e01c315352e0ae427ae7aaa99b11f9f +size 631195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7348931ab8..0ac3c6ff1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4e0456a7ce459538c58632f5a9b493615cf88a18b1ce0b497faef75d7f595f5 -size 637653 +oid sha256:8472ad79550bbb7f393dd90500c8a0441c9c6347bb153ad39b56bc2afaa836f9 +size 651665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a9f5bd61d6..bfb4de2a64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a30d4fa26ffce14e37bf7a81999a2182f39d1b8c20b1014b8a7c4b1892796d0 -size 605338 +oid sha256:e2aa3ae461e0a1f50d65b1d459ea90b9ea1cc22fccda109065399a7447990f4c +size 624923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8705493eca..7afcd29f90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59045342ea81e85a8e5ecb3e12886ed1769daf16e2bfdc9668e47f2dbcd9aa00 -size 650121 +oid sha256:bc6b9c99152b15ab47be323fdcb7cbc6918a7c1cc90276f94c50ce890ac70939 +size 662455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0962eacdce..aff1c9c449 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27dced71f953ce5c2fffc6753f6e6a3510af4cee6c9617dd97f0f2fd234df297 -size 552432 +oid sha256:803654573cca2f43c1d6379199e0e9d92a946b35dd780bcd0838e194a1954b7b +size 558056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1003d8e18d..3cfb70666d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac39eca53e6a31a1370ccbeb536b13a2dd39ee840876b63fd80cf21d41d53aed -size 609220 +oid sha256:c19b67a3e3650343c85d816fcea81da7d27eae23c4fcefc767db000993e8820a +size 626833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2909045645..295a2d6e64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ee4be833cedef6beec673eb3d9547568768835550fe4568b81a6b4584df32b7 -size 515726 +oid sha256:ca6b1176ea11c941e23b5b7f8c1ff315d8c7415bb47057febbb05b8fe6c776c0 +size 528454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 45bfe3d31d..75b95d7295 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e74387707521c818db33599e4e37a115cae34ff2a430f8132712f2cfd0d91c9 -size 639239 +oid sha256:b641a0111d0df927f5e456428301c7483094d9359c94de6025f352658ce97b5b +size 649007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c385f3816a..81b2be38e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63d351acbd29261b52a71aef646f1f917a88802781fe503bdee5fc8e7f45059f -size 594836 +oid sha256:ed06ff92c9e40f32b00d16147b6754d123b66d9855d71f568ce17ee2fc8d7ff1 +size 621525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ab0367cd1..262c916512 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89b3e80b3a9700e7176b9e60743d755f2c738a2bf3fadda0836f43e9a663fa31 -size 630501 +oid sha256:76684b0b23b5d11c22639cfa4b57b1a53fceb0f7f61757431db80a620d9ff8f8 +size 641945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b21e68b710..5df3a83816 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdcaa15f37bb06e6b2986cb3f99aca0f1e7835506a77ec004db01d7c991d2503 -size 598184 +oid sha256:13a49c6c3f786eefb0d9b2e3a9c3fbc93b6528d1a85b3bb19e1ca071ed1024d6 +size 615254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8928b9b323..2ea3b6b890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e46596fe553752a2568a48f663904a430a68210b238271086390797985b21561 -size 642967 +oid sha256:799bcbfe48fa28c0f463f29ee1260bc319f27045c594865a583d6ca8e86fc32a +size 652687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1a328b30a6..6ea636b4e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22ed5d550085e1f1f7ce163031731281fd635374a01e8a62efb6d13e346873b2 -size 545278 +oid sha256:71c6c460d62e223dbc23ec950714eb7967d9918edc212759eeff83d802ca5969 +size 548386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 85aab5b6ee..969ad7477d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac0b2576b154be0cdf16a3af174b1a911a692253fc84f91833a883d44f7ec793 -size 602068 +oid sha256:3b25e40fec3d384fd7fc34b7f2f0d78ace4857a6ad3dbd18dbc7008de3539944 +size 617064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 43b52d4d6a..39a74cfe26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a1d64c9dc8ee51f73167669643664c4eb4f16e4fefce46db1850044acd46dfc -size 508572 +oid sha256:bc5e6e5e6f3d70cf7d7748fc72e9948a4d83fb99080626e0cec3fe33bd20216a +size 519524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1ba5b5c507..7748fa8ddb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76ef7eef679a404edc926a8edc5d14deb8076970b32a7079ba40d3fbe6b1ca10 -size 779859 +oid sha256:8d5e6b1fd28613a75f23716d59a5dfea194fdf096b145c98a29cbc0249c4f8aa +size 810495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2a60af9784..2e9831a901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fab111edc551f72aa889bf656975e95684fbfde94dc0a90306c0ec76d31b17d -size 760565 +oid sha256:f38bf4c91a879b004640656465b9b44c4e8722b4c0ef1f302e4be2742c250d66 +size 794605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e2799b56e3..783f533e0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0aa049a6bdd7b7b4db5df183c8a35a207b029bc9f21dab0365f85469fea10e9 -size 738745 +oid sha256:f4be93b8ac5e9906f2598520ba62f25ab663bf6756877dca2d71ecc5abd885f2 +size 743431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cfefe354d4..ea5c19a459 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cc61bb057ffe94edb524a19ef05d86e27700e6c6e36c4889c6df8f4262dac11 -size 657633 +oid sha256:c782549fd4fd5fb5591b3c1b54d5b11543e0007de3af03a290b0c3368c638177 +size 659261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 42d8ac3ee9..42ce4c2f45 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc2653f023055e31f46263716bb9c5e2f3150ad94c1881c9d5635829d3c404cc -size 772755 +oid sha256:3cfaf0b9d419db15979a1735e52e8497bfa27c42eac53bb29d682c447c73a078 +size 800827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 748fda9811..d593f54207 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cf1916fa26051e8e348ef9856ce43783c2b18b2459b0989f18084184d40e8d9 -size 753461 +oid sha256:c2c37c79dbfca995fdc451cb0e64c015c5d1c2752ecea248fb4b320fd50898bb +size 784935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 50966a1808..291bdf989a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60c28a7d21ee6bb5892d45dd578d7c646df6e4001639116dd8420b42687eaa32 -size 731591 +oid sha256:f8eee507953ead5da042a96b2f3da6f4be59271cddb537077ccf1725b78fa75b +size 733713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4955a9a7f8..eb1bfc876c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:928638384ce82d2a162541f631e7c577a29fc14574984e5c67671f7173b37a3a -size 650479 +oid sha256:734fe0b3fc8c92e82a70aa72c567270fad925fa44717ef086b256e7777a460fb +size 650381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dec4235452..3fbdf89d56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f862c0940c0dab9f074b594b73710dbc89d3016eb91fdd05edbfa00ca17ea32 -size 648383 +oid sha256:4f124b3740b1b1ba1f31a7344ea9881e7cf4d882cb676997b9456520d8f1c342 +size 662097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7e0f4841b7..9df00eeeb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53ec29099db23b62e98b849072089b370274a70090543cc581792ab676707b7d -size 592880 +oid sha256:322a48627ec1ea12838863a348874106dd0e073ae93c4213efe9d2e6ae279157 +size 615276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6beecb8af3..2d26f09859 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53e7f6a921c458e6152b0239b77a301b1a8896b6fe8455e3270b49769db49d5d -size 636783 +oid sha256:c90129aca0dc8457ac769cfc43312560a3f0f332924aab6c8743e80679f26121 +size 653359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a22092078..70d102d685 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d8eac42be38651356696a6861f856a9a0c324112a9211933984bf3b7afd13ce -size 588730 +oid sha256:77c634a2021af2f6721e6d4f7d5a8ae61d6e4feca031c52a62cf08313a98240e +size 609794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 58a8e73de4..b564113618 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:fd9c78356d8a90b8c232fcb69b97501629710df2f2cd9541583314380131e282 -size 641603 +oid sha256:c3ba65ae063e8fc1bf4b0b362f411f54cf6cc7319de59dd09b0aad554bba639d +size 655417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index acf94a3107..bc5aac5fa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77f588f65a8001f882ace6c9f565d923c57300dba4f2376df2f6ac06a93a3610 -size 547664 +oid sha256:a49367cf40a3e87f351c697a6c10b69d9ce878078d522cfe692d43b01757d367 +size 555952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fd695f5e48..0e4cc9c43f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:970a8b575ebed57e4c8966de94c8a82fb428951f7da8a4c0489ed0fd39c5365c -size 587580 +oid sha256:8a4c6f42c6a24becd6a995dab32eace0c6b118d42226cbc7c420771e1e838a3a +size 599618 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e18652c4f8..571b05c7cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1ef76181585dbef96224d114fce456b030c8fcd7ba6ffee7c95dc2526eb7df0 -size 505482 +oid sha256:46af9aca35dd5b65654ee879499af6a5f96777f5912e83ea60c530aa5d473bb9 +size 521466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9e6037fdba..a9290b5b67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:533001cc3cc2ba57926bf5cf9b05d8ffcfe789b7f5a123b372ac43295c69b284 -size 641229 +oid sha256:63e7a39f9287bb57eb90d82bf03287818e0c6875c516fe04d3b96fdd0c41ac45 +size 653217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5c601e971..703cab24e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:062c83db8b1652db7b07fc74242c6cdda8e4fc7e9ff68ae8cce06d8196c7f164 -size 584936 +oid sha256:fd2132a172415ee628cf0b15a820fc7dcdf99a0332f2cab047b3c8d9290e8ad0 +size 605608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fbf873958b..3c0c871020 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f42611edcc8f2aae9f7716faada5ffbfc4df51f80f6d92c74608b45edea04e7 -size 629629 +oid sha256:a718a958006d45740eee29fe74907262efb789b7a92174f908a469aebd0777f7 +size 643689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7d6490944d..5604dc2202 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d68db6629602059d0c6aaafa47dcb5d3662ae37a83ecaeec5b96a507f3c31208 -size 581576 +oid sha256:30f2ded189684991ec885c1492c57f0f176d34d8d1baac4531bc9e740cee6c95 +size 600126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ec938873d6..f44bff1e4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f7fefa1f4cc02192eaf2f5aed598df404f57fc4f98ae051dd01668d60f99fb6 -size 634451 +oid sha256:73571674917dadb04860a4649ce9458932f90ca74ab690f9c75570fa92e8bb8a +size 645699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d4a617402..e3fc84fe08 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3e9a1a2a97eb2d777dca06d7969351190d3bd94a9e63191176d47cadc4776a1 -size 540510 +oid sha256:a8d1207a89cfc333a2ffe3f9c9f3bff2635f44f81a4b36eab9c7900343a6fb9a +size 546282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1cd69ac96f..1db9336b73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93d53aaa4e4f4f1ce285a010a00d0f1b6e5977f057739aefc4f05a1addc9a4f9 -size 579540 +oid sha256:403fbad9f7b898d835bee520055c369032659c675ab939e3aa8ad5f963a10b4b +size 590688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d620bbcc92..27d91e87ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02e9a0f04f83103b1ea0b0433b8c1299b2ce7e097aa2c8ef84de669e5a478ae5 -size 498378 +oid sha256:fe0e72f3cd34d21cc67c6aff9b8f7db1352f40b6622ea6797c1e2b00d93145ed +size 511796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 63beac3fdf..f75f8de735 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbb2f66c249c791ab47bdecfcf9b95649873ebb813aa051f532605092d969e1b -size 667755 +oid sha256:85420cad1a3aec121bddbc3bdbfce38adc824c5a0db7ada34ac92b9920378ada +size 681273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e365c55ca3..429db93b6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:908dd368eec41d73cef233b7bfdcf011211c5c082eff6f4e1bf06e47449dc1f8 -size 601102 +oid sha256:5ef5089b8d0e137300c7a1011d24ee841193fefca4db83935e2ebc7eded7a913 +size 632875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82f74fa994..a850f3a47e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f004030f23a291321d886350218421859bacdea71b40b5525b43bb4a5d4665c -size 656205 +oid sha256:45c25114c2bb575c7003b7d56ed419c98a4d9a8edbef626318fd471d4e035ece +size 671745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e156a432f2..9b2a0503bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aec90a02665a9de8c01109f0534e75c7725854ac663eb4cdc2c54946995bf1f6 -size 606128 +oid sha256:3717937becfe1ba6319d8d98d854edd7e5c16b44ea20cea2d450ad0b8df0f678 +size 626603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index be3ea0d4fe..bc8793e23c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a5ffb5710d59134484d221d0862fac38204aa033a90c5cdc267abe7ebd44f7f -size 658905 +oid sha256:b4e7c80f1072cdaae554e7d906fc9d1956e1c99c76e41cd4ed0c637bade29ebe +size 673655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b3ac130163..59bcb5f0c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acea6f68a163fd61968202440cd32c122df2cc5b1a50bffb7514f790995522e7 -size 560920 +oid sha256:33a4e0ae6c622d644a09b73d08416b5a3ab545f1504f424bcfe372964a57167e +size 567234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8e83bec723..a382f022e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98b7bf787a6d9d279d4e4e4361857dd36e4ab95eeadf424d9dac50dfb82108cf -size 612330 +oid sha256:2ec2efc0a3dd509ab6e0b72721e893d8401a3a0401bbbca0f6caad2f281aea33 +size 632509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c47fe22ff3..08c7459846 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:534774568517f0c1567c90ef98724118933aca87bd1fe0294702c479f989a3aa -size 517948 +oid sha256:7c21e6d90257099e145f18209dc2bddf64eb2b1d2da13a1c92395b1f94c1740d +size 531958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64ca4c0b4a..51a6edf540 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56faf188a6ea4c749b9e184d07e30935451fb5c84ec5647511a4c20974b251d3 -size 660601 +oid sha256:2cbdb9f75840c644f54c90c4929c40faadaf745fee5436b3fa761f59a437241e +size 671603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c501a14f90..97ed484bb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8aa7579c8afc69edd75ffac0895e6180870da641f3d6bc9afcde89abdf652c2c -size 593950 +oid sha256:2a52e9a59516ca2dea80fdd37d137d4a7a0568beeec809dfa6d1e6cab2108683 +size 623205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ccb3f0562a..2f1dd09229 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:893c07da00f363641ada1108d353b6beafce9d4caa20788cb6b1e07bf4a43f6d -size 649051 +oid sha256:80f946b80d997566d7df66d1836cc2216565eca67993f72d4b2e544fedc6d640 +size 662075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 461b7e5d9c..07a2f5a257 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:018ab4ad934677f9ab82b4fd12400e41f94f157f521cc3073be3ebcacbd4e791 -size 598976 +oid sha256:2cf3c3e70b42300a7df8df18fdddd437c3efb2b1ea5697cf5337a82df96d452c +size 617722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e2515641e4..a3347baf95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56525a077ad234abdfdb61c2fedd5e681c9818eeef1a879fb2be3d79f5c7a242 -size 651751 +oid sha256:21a02440887666e20562a25b78a6f85adbc032c32b5d1ac58aa240510838e4a0 +size 663937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 90cc48f4b8..19851da542 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a0534ddea17f33d39695299dc0d31795b0601ca07d7fe1bdac4e951f4b0189b -size 553766 +oid sha256:0665603728aa5ec3da770945ce229cd13a3eae1153ac280bc70273d46fa4a6cf +size 558354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e2416c3a89..4cda7a117c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:175900a93ca8efc274fa98e0b2f045a742b9874d262f828be1f47646cbbff8e5 -size 605178 +oid sha256:22f3323b469ff002493b12a5f682b5aa392f8f893167855e6700eaeec2069448 +size 622741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dadad9f720..1e934d3f39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e67a72543c62de3b56529a6f68d223b5f8e6862d000d0705569110a88fd540d2 -size 510794 +oid sha256:4353724d8ed3530262bfbf6c6a257433a3937ab08b68570e46e9218ea50ce2ef +size 522290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 177f0d8e78..3a895c9335 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a8a3498b8b632c41289e8453bb7d50a3d60d4c97d1a72c828c70550703d5ffa -size 746901 +oid sha256:1fd812693833d3951cf2addfdd2c883bb80d43209e03641c8d5b0495a15642af +size 778079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 51bbad8a10..0c603bf575 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fe9608498be4688b41cad5fad35cfe84de25292e749cd3532ddc861dc31fa56 -size 696037 +oid sha256:c92d8d35827202de8b7da38c5337f298ad2cb9f7e2c4bc5b7d9e84464cffe255 +size 736341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2523b92597..6b76839f58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:387647d4517d3808db07ea7e4ee13568aa4fea7af7bfd0e838da06f256b23427 -size 730319 +oid sha256:9d176a3c1ef27db534932efea8f55220cec0497589a79a2d1f21b0fdbb84445a +size 764409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1f8098190b..0b7c552f97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:289b10ef33d9dd00713be33fc788159d256a88c9f9ec8e07416bda886517bd62 -size 686755 +oid sha256:8acd2cdf8050902f01f7d705ffe215b7063736b93a0915f0555d9badc2acd6e0 +size 727159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b980a2f461..de426966e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58cefbe300c9b312194867d540193e84dfb987b725496f1af463172caba28081 -size 739875 +oid sha256:2d61520cab5a8abcfab46d503ea978e5c287f1e907f2fb2746ddc7ed8eb1ccb4 +size 773373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f916b62e21..0eb846c0c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:2d20c89c16da67d638c294edd7ceee3531f13bc8a64d2e2f92d6fdc836fee455 -size 643915 +oid sha256:ce3aa93b6f7c1791a0d00d29083c5557262d04bb6f11566366dccaa94fbcaac6 +size 665127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4f0c7ba864..84da443582 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97fd32ebffde99ee39aeb7c6664c715e655f83c7254f71b955975a5a83211bb6 -size 680477 +oid sha256:c7ea77b375d604a920eb166dce4eeb49507833d12eceb6743054f67447db48c2 +size 706623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 297c2bc622..76d27bb5ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd54ba7e2d621431b6920573ba68387d50e4c3162c391bc4053cad759f513880 -size 599264 +oid sha256:b620eb9b27ccd04e8477c21cd5803f85f2cdde2b57743ec5bb492521db52104e +size 626547 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86e5379a02..a23396fa7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fc3e68f0b3914e8213d04242409173e14769910673b85f8f89b77a1a7d0d290 -size 733383 +oid sha256:252ea5c3045f6fb1c85e3e6094c6910558e9dc2d199c15cd3d9d4a86832831d1 +size 758741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 207a008e87..04087c831a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32173a19f17c1ad708cabde596e02dddd1e32dec4f730d3b876f4a0ab05c7921 -size 682569 +oid sha256:b05dfb2f63e58f1867f632f370002631713b2425c9af0bd117e7986119286c74 +size 716953 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db0b40eee6..2cf25f2245 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07f28a0093c2c74c543ef31993e2bba59b258d3ece5ea9db9605679d8793bd86 -size 716801 +oid sha256:44ca7a5b8b64a023ea48b710eb5d2af87bad7a2dca766bde5d68aa70d28e9521 +size 745069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8fa4087e49..d764457a80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de2432c84b8e18ab679fa888e197548fa03cee1b144fc1c5341d113c68969ffd -size 672499 +oid sha256:78876a6785f77590b54fb0e104aaf9e84994e8a18c2575563b5481f6769bb023 +size 707821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ad88a796ff..8d6b62b15f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51c6df98f6f023c16ab20d0d58e4e295a31f752154afe32cda6f3f25d44ddd58 -size 726407 +oid sha256:4e0d833d9c801b2156042495ef23ddc672d8eef71a182a715ae70625535acfdb +size 753935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 03f7d83130..2e89974ab7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:342a51fce2c42f115011c4c8f53b4779f5fcea99d630457763e34ace23accfdc -size 630447 +oid sha256:894615bc446dd7ea93a94899fc9a4f6db389ae3ba988608f825db54aed1dcbba +size 645789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index aa039b8506..973879dbd0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f41cde63d2b6ac14f4f727187ea5f11a6160a4f22812ed6f3865ef3d5658eb -size 667057 +oid sha256:3d4abb30828c9f81c3cf9aee737878d17fcb018a4a067ffc52924e5db6d5cb73 +size 687037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8961c790cd..ae24b3698f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53da6dcdc1b4af50a7654521bbd37aabe7515ea45e6a610cdfc4ba04065a8e75 -size 585748 +oid sha256:40ea14bca6fffb6285d30729c985df8dc2fe1732860666d5aa6802e76eb6cd77 +size 607948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d75ca1838b..0f5ab35c04 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:face4844f4efb67d79e016f1705ce853e155d59c25f347241cc42462fed15bf6 -size 767261 +oid sha256:8afa7f34b4f1bc989a4b0ca4de7fc2d75f7fc32d1a543bb91d7fd861a4c4283a +size 797255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d2265aad0b..ce7af23ed7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b85ce7a0ff3863490454d544765b06c1885a50b645c848de91788e372323432 -size 715015 +oid sha256:51213d1fbe627a56ff524036443b11df9b32d4684cb2a193dbed56e0c73cc927 +size 753939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d7cbfefed9..128dc8a00a 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa326d2fc36e565343e9e9db57ac64c6b05967e40cf098e0ac6d154071231767 -size 750679 +oid sha256:f6f61bc6dc17685088aef35d64b21ddffe14bd7e65ac1fb031d9f9cedcf3ecd4 +size 782843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f820e28d13..ee87dbef33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cb825581958ab716769d3bad5c8b25c83ba28e4d184a8b8a0db849a80e7a83e -size 704895 +oid sha256:32821461a1ca6b9e872d32c1f16f1fbe4742b409e4727a8f122c94b42b6abb4a +size 744805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 455607b67a..b1a43df38a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76e2404cc95fc736181eeb3683e205a1352804194ead8a7cc27ee3494d893b91 -size 756387 +oid sha256:c44aff38eddd344e0bc3d51ef911fe2a00d42afd76f5ef515df9e10fbe2280c2 +size 792203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ac6d30f013..10a2730b30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6aec7233bc2abd06860591c07bc580797bea73de6d49e9b544079dd0e80a625 -size 652977 +oid sha256:2d6627fe8ac665f0ba7bfbd3e55d1cc0bd5bb8706f666293e53222171bc7b43f +size 674189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6c29b59670..f643b14525 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:148b70904aabc3b6f4952e73316408b95c61581b7b33d000e04603edf1c81369 -size 705127 +oid sha256:6243a73f5050524651bcb2336dcfeea896dd58f1cc0f4ee039f0653c6fa0c7e8 +size 743607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 38ffd5950c..e426b16a0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd79367fd066a27d4651f5749091fa790488374e9e5a28e7ebb7f06cf8de87ad -size 606698 +oid sha256:87178a6a45852c0e68d29f8812743739367cde322cc338cbed9dbc4cf6452db2 +size 634819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index da6d2b2051..5ad7611a5e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ccd6555a7512369b8b2a015990a3e80f9d8feeec5a7fa75b5b1ee9f6277eeb5 -size 753743 +oid sha256:c9a6c55bad24fe81417b7ad58d7d1030b60d64c1c09df3ce5aa5a21ff6b44ff1 +size 777917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3214c39006..da9cbad0ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67f2b9ef930bb75e168410bcc3148559ce57df2c48c3028050b9ebbf4890b2a1 -size 701497 +oid sha256:06c9a6a6bf4e161a3605b9a905f8a10cb52665e0a920898720507a6566cecde6 +size 733761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab87e07958..442fc94b53 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da6d213a6a99406a1330f3a6e9ab9f3b7c48dc65563e7db5aa416182aaa8addb -size 736371 +oid sha256:f695541394270ea6f63d5be2a663eec588df664dd0d61a935a7c73665a1e3a1e +size 763455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 615970181e..1f03e6227c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74bbdb8edfec11a97d7f44917cd41d0ad018205e414dc2cb2d0715c7bee48009 -size 691427 +oid sha256:20c0730b6ca6cbf9f83e3d907acc9dc4d39aa7b6512e2abd1597a35f4014cab8 +size 725417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3fd48d0d44..f33689e9b9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9b3034f4da94171528f3bebeb45979fb1d119d16d886880c7332c79cd0300bd -size 742869 +oid sha256:e1c9bf82c5c7bb3949bc4bc11b7746bd86faa47c743d81bc6fe853bff53ae2bd +size 771977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5d8be2e28e..8495cba32f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cc80c6101eaefab39b61961ef19bb322e1e6a279ce9d05d601f568beed1feca -size 639509 +oid sha256:14668e8fc8c723316f70ea782810fe7e297611a6a402a800e5b7115b0de096b6 +size 654851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7f52ea7b50..9a64b20cba 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ac2859a82fb9c926a553dfd2d529204b3952d7c2687480f74fa6c5a24f0bc4c -size 691611 +oid sha256:daac426adcc8393a2989de79446bf37ec51efb29e14fc61cb1cc41921c7ae4cb +size 724171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 194f860e78..fd5cfe280f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:998da24ee58522401e8230aaf1c6006e20d5a245216407c4fdca6c5f47c10da2 -size 594020 +oid sha256:c725b82de1ce3fd12f9fcdd1babd608611da73df9805314ae3ac863b03f0cce9 +size 616220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9629b0d4dc..7a05b2dc77 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82d4cf87b9c9e3a0379f1e1e9e78c15a966ac8dde95b8438a3011fbbf7016537 -size 704279 +oid sha256:61a40febe1441917d34d186229cb895815b8dee9416510d78539f9e7e1142674 +size 740785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c0d31f45d1..a3f5c4e2d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea388eb5492f8d50f2e8f19675d5b3e9d3a1d07c9a0f348ae94107f4bc148ffe -size 691693 +oid sha256:e219e2210271119e2e51049824f1a1b3cc2ba3cbe21afb97970900b754f78efe +size 721243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b6008c65bb..f1d982135c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:402dccf9fe50ee38ed0c5553241ae23347b60e0bdfea02d18092997821d67055 -size 678113 +oid sha256:507226843f18a95d4893dc8177411d365947f73471910ce722ad226c441f2152 +size 694097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fe6a6ba478..145a1f5803 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1227e970ba3d448aa6fcbb1024578246c3e30a4eca5761565a1c6ff959307269 -size 593398 +oid sha256:42c12ed82b8ca6a74ba23ac7e8bdcb8084d4573789f9c7bf56ead0cf77439b43 +size 607310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d796a5a681..adfb83d835 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7da912d509e15dd8854b6021cbff9e2dd9a4a04b4f55f837f02adc5e128a6a -size 697125 +oid sha256:2ea35a45a4ac2acfacb4aa9d8e205c39ee75f51f768761804175565389ac9a4c +size 731117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8e0652fbba..d92e865264 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6648baade9be2c22abaa6701c4f1ae8ff284ae3ca836b4d091fbee2d6894a950 -size 684539 +oid sha256:6f0dc940f61c863340a36866b8ac154071452f3eaef99d9a36b34655af83830b +size 711575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0dd97b82a8..5445eb71a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46b6203d016d94a0b7850c69701eabc4a05b70e527d4f520f14d1c741d835b4a -size 670959 +oid sha256:0fd56be561db7d48f4b95f2bd667015081ceab9e92412e4dc387d3945aa6b29c +size 684329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index bbcd193b27..3cd81269b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:827824d270d5f9595b61ded6d44d79ab1cb3dcd40576e4cb0bde2cedd1805f09 -size 586244 +oid sha256:9b9c5a49c57b2d1d38b4194cb87f21dc8ef6f1f08df8ec75c4074abc849be3fb +size 597592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 089acc2d6b..01591731dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3779f179ef30fffba1298948322a41d115baac6998e1150b1a2f8a965833be1 -size 625293 +oid 
sha256:6ad49680f1908e85455e1e58b1c098956f9ac2227dbf67c1509ee2a6a4fe5618 +size 640241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2222f92aad..5954a3f814 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6da0ab2141b21c8597ea6d594be271e2751491c16ae38739a4f81d2f843e042 -size 592778 +oid sha256:9272dd514be222420dd1797eaa73976bd15bab9f64194190661383e7d2d4b5ec +size 615078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4907341322..6385e7a974 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37f475d6378b24251c75255df2b6fd99653184a03a2ff5e4238a239a5615bec4 -size 618923 +oid sha256:9d847de715a9dec9d3e331d5e655556e2941494c249052ed1a46a64024db433b +size 633969 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7b03537125..7d253e3c7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31c8e919706807866c7d4559f4995ec5afb02a6e1e8e99d044d7a5795cc06017 -size 587248 +oid sha256:15d514aab9de663ed99c84cbae4fdb330c9afeb8ea02b137ebb21d70a5371a5c +size 608806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4954512ea1..bbcd856f6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af3943ce98ffc0ae82dcb6d289c8c6d232f1947c114c7d12c2c5ad3ccb486a6e -size 624039 +oid sha256:a4b0a79fc39bcd349cacdb47c77b56cfe725632a290dd6b2f8ecd8fdbc60388a +size 627985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 07d8d8c57c..c7e4152e16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:012e45548b2cb04c21a05c99e6650b2dbe73091893e3a32802f40c1dbbd2681e -size 540608 +oid sha256:21c1e5376d227b62f41b1bf30612e720b54826c1be92fd560f06e6ff9819eba8 +size 546972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 98689fcf42..dfc08720be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a25b52edfc513f63ffd1e65495bee61bd1985b4e5274ebefdfa25a5044949232 -size 588664 +oid sha256:a328d43e959aebdca86f1eb215fa7463bacfc8fbbe0463245441a292b7bf347f +size 599270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 404724ab6a..1dffdf744d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36fc76bf48ec959104ed4bbb039839e046a443f7cc6594d8d4c96910ebf8adae -size 505530 +oid sha256:5121086bb4cbf5f4966ac470c8bbb8b1c1c912844fcca2f34b1c4c6c02c5d471 +size 520478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 263cf6803f..d9ebee924b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73f9683303261370594c12068ab1bed462efd7a733c70b92a7ac8194aaef443b -size 618139 +oid sha256:23b6afff3d8f187d5d6bb2f4869251af933e00f4b7411b2fe7e6a0078b3e715e +size 630521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6b78e24203..5ee5d18176 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d30a8b767a2a281b0b04261f3e04d07da6a181bab66b8f86f6d9adcaba9663d8 -size 585626 +oid sha256:f96ff77213ea0f129cd2d9e9f088d915042b94a77bae6681e41ea688f5aab219 +size 605408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d3c3b07dda..52845594e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afc8337d74208f95d5656c5ed45935687adeadab9f4b6400d5ef76ba59db18f2 -size 611768 +oid sha256:be905bcdc9aa047452289f7c6c05334d8f8d3de7639ba15c112b413b8ddbaf0c +size 624251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 71cba1f526..c5c58209e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:963348a5cd17451ba0569f98b2b81d6767c6059811723199a0ca175c49f40a19 -size 580094 +oid sha256:09824a0a5a2d4a071c821cac9189a14a80b2a880f0bb7f6ee4a75fe69f806892 +size 599136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 16115f30d7..40d6ce9060 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f84ecf0c8dd95099f4edf1198e93d271d9f3008c8f8a7e09d37c06157d9c37 -size 616194 +oid sha256:0361bf319ad760f72aa5dbe92f25637c05589c29d8786ef120cfbcf9d47473b6 +size 618169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06175cb2cd..e26a3ce4ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:910da163b2201b880d5380a99ff4fc5cbf737e49c9534177875e88e090919c04 -size 532664 +oid sha256:af7c6cf9a5c185265e040d77623ad1a0491ec2b2286d173178fc730e4c466332 
+size 537302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fad2514ef4..8d78da200c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f7788799bb931a4033e33f02a322af4d5285f97b6f97317c2b093c6080da315 -size 581610 +oid sha256:b86d727065afa48b4f893df23becc7d8c3ca1eeff11dfb947a295f5683b58d7a +size 589650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1922daf096..c7d094bf8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69a0ed6b4ae10382c0f6c5aa7303bca5f7d3848ffe9b4b46f9a71d475ee359f6 -size 498376 +oid sha256:599a3c44e850078573f0b6eafbc3828e46a228f832763f865ccd99e8f28253b6 +size 510808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca76e7db72..b4f6e2db23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bee69565b8198b39f4b9f90228839c183c6ae22516e31a5817c1d07c1414181 -size 646293 +oid sha256:aa12e6fef9dd65d050c21a0a49b1b86e1ee983451d13a4caaf35ace5109ff4e8 +size 658627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ed9a42935..9e6eb46b26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb1f6694dad4d55ccd3856055f74f9464f4da85cba22ce97edf5a8fece4cfb4a -size 601890 +oid sha256:ca9d848b8e2b40d0bd20d1979fa022ffa6d91b7a611e8b1f277459898ef32cfa +size 631885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4068ae607d..1a3cb6ffa7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bcb1de20ea98d596ca26ace3a54fb15c5e33d76cf6603001d23082513c19314 -size 638345 +oid sha256:a4f275bfb152e619b9e9c20917621cb6b9de1df70b8e6b675213892125887802 +size 651565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 470dd0b440..ddad703d53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18eb7c8cc73323aaa2824807eb328a8b4e1d4af0b3e7cf86701c8f61abdd8832 -size 605238 +oid sha256:fc21ac169afdd19293c0a9131c7e9a786624c00fa52245b67f8049eef6dc5695 +size 625615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6058e35b77..5d0fdf6353 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd3e47e1199a90aea0f77d65b9798a83b4ec1f601a9ceb46096c212486a31ed4 -size 650071 +oid sha256:47a28d55893896c5be37411cb807d2dce0ab5d77fef03f557e69d094e3218f40 +size 662751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 530fc1cbe1..2cd10b41f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:306db80a84162f868832cbe0bbab5b95d055931fca3b5a6abffecee4c46c87ca -size 553122 +oid sha256:854a1725b94c8cc21a2574a7d7b6e12949db4679a7084ae565003717231dfbe5 +size 558254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 87bcea5206..50fb056835 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:706b1f784a101f7a3a49687b4cea972d19a16180e73338d66a1b9a847d84cbba -size 611540 +oid sha256:e4d268bbe5f6d102a76326a38c02256d320ed9610ea3ba48261cea08d3a85467 +size 628561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 566c1ec2d0..22739fb14a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a951ad5b5aa3f73823c8683181951c3e2c2f682835316dc19059512a8ca9d2f5 -size 517256 +oid sha256:8bd5e0413b07761f14cad5af4dd429e6464a194fcb4edd37ddf3f33d7c64b7fc +size 530180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cdb1d58179..5d6c54df0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:661d81a89d351812814dcea3b79385edc032bdcdfc81a3a8e480e02a9a60de8f -size 639139 +oid sha256:dafec6d927df7742516a66d22c61b7dfb554167d170b51fdaf15233935dc2dfb +size 648907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 343bcf9408..a895bf49ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:007af2cabb0dcd627c41189140ecffa69ac82b88546e4e8491998609e7713de4 -size 594736 +oid sha256:fcfd64dec15c7d4b56f01fc2c194aa818b1abf2a87d69f0afdeec4441a7f47d1 +size 622217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c45be5b043..5f78d09c70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:798c0ceadca26614b56c5f0f681bb1bfa5e44f1abbf760f35d80b9056543d95e -size 631191 +oid sha256:5bca479ad6f27116d5a4b6547cb4ec715154bfd349564a00bcff5caf8074f733 +size 641847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c7d2a52dde..f5a5a24c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ec7998a85380a04f966d6acc6c455e5013c7b65b7bc6bdc0d9e4c66314e9cff -size 598086 +oid sha256:7a40f373c436a7fecc496aa74911106561cb1e9161873bd414cfe625761c6f7b +size 615944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 68abba21da..2cf244342b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7465e455d920807cba786b15b0a0d51462c32bfb7cccdd7affb3ca9f1c46b06f -size 642919 +oid sha256:90438744ffe4e5014e8febe9f6d01b0b018d151b9a73b9da8c1d7ae5b6f17736 +size 653031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d3d4bf5e1b..8d59477bd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1620992584e5db3f255383f708f001884880d9ffe4576ba36578be0b0afc01fd -size 546018 +oid sha256:12a780e438d3e3ea9b9f0902a7d0abc6ce4d7b4d250397fca8c764e3357a4905 +size 548584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9a37de9e14..a79840347b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9db588bed7ccc7c1dcaabbe917824aa9944fa077884454a0371358a5ed328a51 -size 603596 +oid sha256:25c3002fdf77ef51386ba8fb6dfc58894b89744a6d197e09d6842f83a85962cc +size 618841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28959c8c53..379fb9f767 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b187a2dae2b5dcc7535353dc1a3358fdae748d4d9c6e6ac875f98f48f2d6bb19 -size 510102 +oid sha256:955c04e1d661f6db2221d7629d8a8fe028fa29473f0fa48bfc563c2dd79365a8 +size 520512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a2a898e5ca..bf3f0d3cef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39ca7e21f70912a8e2789110d09ba61e207f75d3115a9d7f8ad3cb753f9f9cfc -size 831215 +oid sha256:d20f35ad9ac542b582ce03f038bd289f562e38c958376f055b7fa7aac56d3ebd +size 840293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 31001c965d..7116038ddb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7213fb15b3dd14b0a6b2f7f629287b83e6cadae2da980a400558348f0f241c5 -size 813745 +oid sha256:8a10311082af84a4d0e63024024c1f0aef84fcd9fb710000255f23157b714e02 +size 826425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 279b9306ec..b29de9fa65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d118d9dcb337f3b2920b2654d9343b4ff6288483ef774dede5295bb82a04f0b6 -size 793949 +oid sha256:4521674a36fdbb35747d96debacb277c8393b668d2c98253ad2e9aa29efc33d5 +size 790299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 27b8bc59af..4610e66ca1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21570bbe9a2f6e2f32d054a4b85602e2073b026a1a7bfc5a22e9d0c6d7512dc6 -size 708199 +oid sha256:851b0b8bc2e8654baeba7afe76cfdc89deed21d3480254d677a58d5b66c7148f +size 697791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 91a1dc1121..7e7dd85182 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b69104ff4560a7d0cb1aa823b166b387cbda32fb81d2c9df7e0bffac407f53d -size 816959 +oid sha256:4fd61f4e99eebd3148d9366d272b076f5529c4a63af526590f8dc5cd2180f467 +size 817945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ff57810e1a..198a99fb76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ba790304b7740f1eadb500f15f4b4a9ba70310172db9c964f626a5abcc50452 -size 799439 +oid sha256:3fbe6cfad9bbe0eaf02da47f21bd221be155c856314d372a6539b0e33d5f4579 +size 806099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2057cff970..9bd4ae57bd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26c7619df8e0fae1fa59ea38611bfeb79485e088006f7354772a6d9569d3fadb -size 775005 +oid sha256:00ffd59da8673003e8af119128dda5a7b75d35cbd253b36c00153ed9ea944b55 +size 768247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 5c088b5625..8f9cb4ebf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:175c3447e8d795af48bddcf673f75f43bdf82da2f2ef186ea7f3a46ff4360003 -size 693893 +oid sha256:f8a5cec5f336bca75470c096e34639ca585962dbbd8d04ac0d71aff1185d1cf1 +size 675443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 870bc7b71a..bfec09c3b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:273c1db85e6b54f36934aae48d83505ebd25f06c1b1b6f71c73e1f7ef69cfe0d -size 685135 +oid sha256:6c4ea89f6941203161dceb4f4a978d57491b4cb3162f7a31c7486884f4edae6e +size 717745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aa4c8938c9..00a311fe62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6430a9350541ed2a4a5e9b57c0a4860e6659aa13a3c5d016d9caacfe16c3ff75 -size 628105 +oid sha256:ed296870fc05e8b5dac689267d7e4b6d2cc279295a0fbacb16961c8e7cbc92d0 +size 668409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1d03c4129e..6ebf86d806 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddb8952d77e694953298ad9d9849b95e1a935d5fe27ee94bb413a2c9a06a4f6e -size 673537 +oid sha256:4171e7e80a38331a5f4dbc614881af57c40b9a0ba8eb317fc3d6f0470d5e2eaf +size 707133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbf38880f8..0087fbc19c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4607354446fc670c019ebd68fa56260d414d493d35f23023884de48bf679a27 -size 622771 +oid sha256:75150b1f487d4ea3ba05b5f80cf9eddb988ce8aa5f41c4ab3d8110ac44b7bf07 +size 660263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 59f0f71f7d..9e732e7fa8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:c07aab3b54c11a4fe71bb209b1575951333c265852b654833757ed5d576fd8b1 -size 679443 +oid sha256:1995fc9e0cd8e8e569f3ec362ed12e2de1698f4fb2390bb65ceaa035e02f2b42 +size 712495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82864eced1..f209b1f67f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5c653ea1bc65df50597c28d97a2f62c75ddf20a14d21cc6ddf26f51273da383 -size 584862 +oid sha256:b490823f58aadbd0e01c7e7e53817d206f6fbe47f9f1065b823918d4dd96fdcf +size 608294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 159a2d6aae..b864dd4a1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:402a34874071784705f9fbdae3f2d6fff64f17ed4e4429a126e0ed9db4cac26a -size 622017 +oid sha256:7ca8485259025f43caea53ba805391c4098641a436add05d8abf497aba568900 +size 648261 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 53fc873b6b..8d31e69d6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a82a693cd81a0c72c127902e1d064ac3311c0a202c7e4bcb8d7c376dab614fe -size 540016 +oid sha256:c2dff9d8505939e6abb3f67a3c7efee0906c39df05c931156ee91d6cffe13070 +size 571046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f2c9c60cf1..523f20ea19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27650cf9c91da950b7268be7661c8b50eb06ac0110c97f1ddcfa124d2ed4bad7 -size 670829 +oid sha256:2806fd6f2a072730df9342f926b8b9bbf7aebad1489762728b0c81df56ea2ac9 +size 693917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9d7ff6a075..350109443a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17c896b82cc9d58577441740e61b58d0cd77004eb12111b6fc97be8e236321ed -size 613796 +oid sha256:3cfad2bfce66d3504b81254aecdffc4b68d6491ea46bd40abdc18663c3ea0929 +size 644581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 101819c6ef..4aef8777fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7930a14fcd108f6b7ce8708f3a192c42d2c4158b5d6fd1e3ad5f45e3a72b041 -size 659229 +oid sha256:a6bf8c2cbed339ea64eb245f4f702a316813b51f8ab63091e27ee89029a76e96 +size 685179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d4a891def1..d54a6ef601 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:421b984c960e7249a2866f8e530805b604bc9b0370eaf470a87256f38c78a6db -size 609252 +oid sha256:5c9ca81603df42c4fa87883c78ebd193f82d18d6635e16e2f6ddc9c6eb9cc25c +size 638311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6343b10767..224a38d383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de23726404029ddd116bd7bd6bcec8d09ae0ef373390516e54d6ca07f60f3751 -size 663557 +oid sha256:72df199c25a0975799ec8e9605c44b3182e5261272ff0a8297f758a32dda7100 +size 688519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b2c1f0e20c..5828e61d1c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:492c1a949cfe2553e235686129b01dbc7ab73e344d07170b61c4d69cd120bc50 -size 570604 +oid sha256:24d703e3479a708c4f4038ab22ddc055deb1cacfff008c60ac54042543a8f635 +size 586934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a70912b395..1d568414c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a845de9dc1327cd41293a357433d1971883c4624ca95d87e4e3b3e126f282ab2 -size 607610 +oid sha256:0b6475574bd3e24b7045f3508686266f78431d7cc1488fa02c4a0036e28ea907 +size 628825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1ffa9664b1..8a1c7ebb50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cb0d70a1824bce5058fe8f4779d0459d4163ada5228458983c3109bf95b31ce -size 526548 +oid sha256:4bb5a3049bb2d0f1db359f63829ee730c81ffa9088ec66a36c2a0c84cb25a438 +size 549784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f716277111..004f7280fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcda9f2e31e45b19ff1c45ec1ca3349b013ac0ad9da106d491c699042e480002 -size 705693 +oid sha256:e420ae9f6d5b2e4cd8e79a362e7f548689c054288bded9926e90a17f086d5986 +size 735341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 84953501ca..1554d16b19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:851af201d34e318e54fa41c10739794b6ac464207bb0bc646f19bc88eeddc46c -size 635539 +oid sha256:59b421d332cca18ab7e8740d7c74f04e4e8987725207319bd55d7ddc8863c23b +size 685217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4a8668ce70..e34694f001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7090f5d816c59c9574d71a4b617e5f994d63bde9a9467a490cd2f9ceb12992e -size 694143 +oid sha256:db81e3db1eb458ccc261131ed0cf23b52894a41b749075ba7e5a56db420b163d +size 724729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f7c6974726..e86f922313 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75d44a85da9637ead4a4df1a5ec9e5643b8ccd0026f5b6efeed437512489065e -size 642537 +oid sha256:c65a19d17de4c6d0faaaf1eeeba584af9d186ae35134c240d9921f4f04b86082 +size 677861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9866eea9f7..eecc93e36e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22fb39284c12c9e18c6be7db4d3b7cf9ffa144afbeed9f4b0bf2ecd69fdf8688 -size 697137 +oid sha256:dad7a5d76a5680aacd0a774825038d9b91a7401d16c8affb0810a9f6876c0e55 +size 731325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6d7394889..890169cd9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bc7684badf81ccd39aadeed48322ffd69fe265252858f528c6e2e401d6451df -size 590026 +oid sha256:8f741dfbd42b507e2da27da503bf7d6b135cd3dd33d0d5313b852faa41614f6a +size 608280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index eb07238ac6..73e315c7cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14b3a40f28811cfc751fa2b39d49251a1deb0db9cb22d4184d8a8920bd6fca7e -size 648543 +oid sha256:b39e0665c230267b3b2da9211c378d65e0349fa9bbdcd6eaf72519af7096f3e2 +size 688799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 32a405cc2c..185f1fb78c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f60a248d1a7b3e1e808c2d6031e3c73aeb8de5ae94187722faccce12d997177 -size 544638 +oid sha256:f75e03eb601d62ba2365ab39e30c25e8e76001fee3b650cffce2f3c67deed000 +size 570438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8f41e4f281..8846e9f9e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f137574d33638b94f0c192e0c30cd8772b41f45909ac56b3c9cc849dec2b8934 -size 691435 +oid sha256:c038eb0192acb5b6e5ffc08ef5611d58548e5ed4a31728ecd5aac2d1ed58e94e +size 712303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f7c051a716..05231ac0c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bd1074e8ff162af8b48b360bd1f128b470538f9cc58c549fac63618b9667522 -size 622021 +oid sha256:9f1745d310b2df730824d42ef8d7085aeb8ffbc8b168a164ad687157bba74ab0 +size 661389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 70ca0bfc3e..03b6b3ad50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:944ea5b903b3d68b37a2179f0e3260645416ac15624ce225a92257b5b9c5fa71 -size 679835 +oid sha256:686b2955b9cbd0a2ec784ece262ca8efa6f2a22990b5b3f4b98fb11c7350bced +size 702775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c173744656..8318e8db46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b847596c9322769d6a6e0f9ce370602052fcb8fe5b8c59d48e8d1f945cadaac0 -size 628231 +oid sha256:3c77dda7a7126c2d238b164ed29c7f640cab09d8b6590d82e6685513516da7db +size 655907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b4e2ff3200..d9a429a9e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:145021a7b74dea7389e6b4538b9866f2fbceaa4143e670d9b902ed4cd776652d -size 681253 +oid sha256:68ccdfb98f68ac96d759a385c010a8d7b5db29d1bd412eeab0c92d568002ee4c +size 706461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aec687ecf0..22cffa11c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0e1ec666f010256f8b6c37a9091467979fbd03d9467b71dfc4194a4db8526ab -size 575720 +oid sha256:7e79506e0fb547c023e92a402765fa343a95276bd4348ed3a2356eed516e8a61 +size 589680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b4f653d065..71fb0eebaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a7ab3f0452a5808f80ac60338fbf8d3a11d2bb116df5f0757a1593624e3d79b -size 633151 +oid sha256:85da87c6b35ab9020044e1db0902f6d0fb96ba592cc63f17123677629f64933a +size 663983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11e11dc929..4d26ac395c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aadd8c26e00768270a533c18b17171cdd5af449447813b4d88887ce214d038a0 -size 530330 +oid sha256:a575f61ad9adec24813fb289fea5fb6c906c8a549e643b089b2252294454ea2d +size 551890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 84ac9f73cd..c0f28f728d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03adc2b1e654065c71c8c8ccbc817bc714148cc59df0828d132ea2aea7c8d484 -size 824059 +oid sha256:8a51975a58b59c85c994747724d8acbec6b48cf514fc56bee11ccc4b8ddf3263 +size 886663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b26f16fc74..460dc2be00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99106dab4c18e07d595118731395dd660ff6141607765866b5f2eb6f4ade43bc -size 771813 +oid sha256:8f3b42af7211aecbe74c894f9633726ef11e217efd84ed74b8a0aee4f6506108 +size 843197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6f1c61ab51..4b282884e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad7fd074b7a4b33195a03f4289a4f7f295766c2ec6a9867d93d400a71252e5af -size 807477 +oid sha256:eca4dba678057806029f12e0234475cb0fef104f62d8b3d5f96dc298cd447d2e +size 872201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1466c22e24..aaf734d787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08afe8d5ec4bd269d9c2e7cb781aab70b85b971034610a0776c7495ee37d61c2 -size 760953 +oid sha256:d4a878ad4aa782847708e301fbc05412c03c5165b05c372542e8e7eca9080f0b +size 832487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 83aa69ebaf..9302725e97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9ef67d3154a358ae897ad3569f8e16e4d477f84ce3f950a845ca909bc92f516 -size 796659 +oid sha256:2558728a0d57c4a7794285133cff2741966308414b31d9a6f1e59a376843e3b2 +size 863555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4027f0d9f5..86e761c24a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:599b0dccb91a4118c6cdc56341e4b30ae53625fb959b5fc80623b4a5aac0aea7 -size 720875 +oid sha256:d8021ef58077360ce8129b74ccadb4099d0477857249e1346423a3066db3ad25 +size 775339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 78ccd8ca02..a323155445 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72d13ac9940c3d666268cfc8385bb507dd9e46be9fb0a28fdbf7cd40084de4d9 -size 734249 +oid sha256:8d3439d6b9149b3c95753062b3189ecd9b9082a05506f43ed0c359079ea6fd14 +size 794387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8269dfe180..487f12fa65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47e67c33dea4ba90994bc1889ea4c86f89cc9eb22beb38660f4e36d164ecd2f9 -size 673611 +oid sha256:3ab742c03f42c3f85ec4035aebcf0c22d2688a67f2a2c1ff417673f5721e4685 +size 735771 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 507786fdf2..f726aef88d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e57a49fb3445b472a4971361aa15fe3b20ff2effa97122a5eeb66d716ef6b9a3 -size 797073 +oid sha256:267deab746e9f7e165b57f88a45e211f50cf0eda3e00fd06e5bf753de396c14a +size 841177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d941602e9d..cf04903217 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd86844e34d406f834473ae53dad8a24178c7cb7ebdb740ebb4896eee019ebe0 -size 744827 +oid sha256:faba40112f59a121ffccc9980f3a4fef85c4e0942d3353d612aa0e7ddaa87622 +size 796923 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 25cfdd1d5e..16dba1f987 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3261b719938da55bb4b1258822521bccb2d2306a78c42bf147142d8861b57fc -size 780491 +oid sha256:d8abc50d79ce371dbdfd61b2cd6b55a0edb2f9109b1225730619eae0514f56f2 +size 827555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d7151c5c80..dc05bd1e57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2f3075aacceb2fe92fca7b116f3cacfc26be992519496c675fe7b073b5257a2 -size 733919 +oid sha256:5927ac97240e4b957f2f80f1be4da0093d411098d674806ae8788436da33f80d +size 787001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8241ab4679..06d9b048d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9389537824e4d079817241833925fc22af9bec34a742567ebc1a5d4348c5dc10 -size 768883 +oid sha256:754219916f4e445d934b23113c4c06c85994323885350a38b06cd94597313265 +size 816491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 46ce7e6856..2d44e45e7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d21d66688324b70e3d3086d6ac2d5a750d45baa794ffa483e4016ff7396f6c4 -size 693889 +oid sha256:4a908b18a3ff15b02476876f3e6f4926f9d1be524f0798b9b8464d5998f8e6d0 +size 733455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 05945ff787..e9529be362 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f28e28822325e8f67b8f7d4dfcfe5c2ba1248d1cfd5d64e3d9d415ca40e6245 -size 706525 +oid sha256:92fe38f4b407dcfb4936c6173bba89cd6064f65f68be425244d2708381fe7e97 +size 750777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f831ca16f..2d1f282e81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:899cba92f8126c6ad43f34928bb1fbfc1cd1b74e7f6906b6ad2c8f8ef64ffc93 -size 646625 +oid sha256:a86ef598ada2630e2ed00ef2c078162edb31a52a4e6849fb8e325849f0126571 +size 693097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 455f06dc30..d78bf2c244 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4251779e93776cde25d305057b9c433320f7836254a60bfa6047327414467bbe -size 844517 +oid sha256:7647dc38acc8c80740e9660b3e312ece4fb69eb0b68f1f0492cf3561a448f1ab +size 907071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b8f9bb48a..f58404f48b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e25042c698a05583b42a7dd23a0239568b34234a8d01d4876eb58ac380fac95b -size 790643 +oid sha256:b34ea6c4a856866b9e1c953a4f6bc8ced756a20c8b256ea45ed0edbec82990cc +size 861979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 059eed7863..7845314d3d 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:416fa702f4925944fda71369351d35c2af5b622825572f8c32efd8683fb731b5 -size 828723 +oid sha256:ce98c7a129b45717f1d46cc52bcd555995206888645415cca2b4680bf6996351 +size 890933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a1b7b7178e..cbf96209b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b7d239d308dc0031a10860006bb2ffcb4992dc4b9d85faca1a2e61ca7ee8c45 -size 780573 +oid sha256:16d687d914a648ac107db4e094d051e2f2b002a08d34aac2b73e9a9f4c5e7fcf +size 851169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c1e35beb27..05a27a397e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7e40c43e146bf98c7338343662f230b60c462ae60f7e7ac1191deac9e7664af -size 813959 +oid sha256:78747fbe7392f7a5a060e48ad0bb80cf34a99748d1b0c9af0b3eec28d5a54ba4 +size 879917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 541567e496..9b33b03a8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbe6a303831da20d8ce926eb6ed0df131d8a8dab850fdfccbf532ef80608e353 -size 703839 +oid sha256:98221acfd17931e0a55a9a5ad875d33885214972344d2fc9816a38b6c4f023af +size 752531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a8aac8313d..1ef48e5d07 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae5614c49e317bbd52fdb08ab50db02f7ba6afa5360ab2da7faafda10e0a93c6 -size 761171 +oid sha256:3d5ed8220701313a90617da0cf00f046c1a6fce7c8754cb1a9c12818284091a9 +size 830483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d9f061f445..df9c355f0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09cfb9de5b89322421b4ce28caad0f9a7547c89490b4421f838e7a20b1f56566 -size 656033 +oid sha256:2a718cc6658412c980aa26b0e69807b2fe12906156e449399d02b3b34cb11207 +size 710695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7b95faf1b5..48c8f889cd 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38a876e914a124cf4914a806de32cec7ec7522f32c82317b3ea4c3e97f530c32 -size 817531 +oid sha256:33901e920145d697261f393e6a7677d4b17bb3c9371a94510d1d7241e445e0d9 +size 859613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b61904227f..b5f7d40500 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:937de4c976edf5a55ac338b58b430dd8374fe3c3a197442520eef35107a1faba -size 763657 +oid sha256:1d48259b792792a827065fa3924b44bda5e6ad1cfd6db9d958d3ade9723c288b +size 814519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b039c48264..0992900e82 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bdda64421b8649d472094edc8c9b2014a51fc0c6cf29f7e1ccd815aa3614701 -size 800949 +oid sha256:ffdc79791c0829748ee7170f7829713922b9df33736909833736a342e29478f0 +size 845941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a4b5af57ab..7c1c51784d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d225289025a244cc6e01bf0679f0870ac93e05647acd4930835a98ead113947 -size 753587 +oid sha256:5058eecc0c2b0bfbda6dd1aff477ec0e859a6b3643323635926ef90d5f4b72c3 +size 805387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 389828fcce..076181efd3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88a1260c5ad4de5b91b13fb0bee6c2d1df330123b9300ee7e444cc584f273ca2 -size 785345 +oid sha256:b3bdad8fa11add5d6664080231a07d7f25895dafe463b03b4b98fab77c554024 +size 834729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a350f7be75..395ee8efe5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82f0c36a2d4f7f73c0de3260c7594bc8a4b20da90bd367851929c1374384c2db -size 676853 +oid sha256:c97dead7b64d08db9177ff81e0380fda05694b3da8a468e329b8aec8bf3ea1ea +size 709117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f434fee9c4..0739733ef3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02e9e83e3a7a7e9476b4f40d4cfdfd1469f379e29a93c37a8d016ee1f2fa6a18 -size 732607 +oid sha256:bff113cb72ef24db6474f4dea44131529557885d9439ad7068e4de7a2279d985 +size 785295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index acae621519..ecf4678b91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:879ca8d430280e06a24d7c16fa65149b267e92ef47c6d6857895a8ca3f63ef7a -size 629047 +oid sha256:503fa4334b3ae1490c58a18e93e2b4809784335b380cf662d3f7ab58ee308b61 +size 667331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4e614cbce9..7edc7cf94a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f762485f2e98b8c7f93ab5291033450df7954286cd6cb24b7b568963d253fc4a -size 730179 +oid sha256:9ed637c0e85f884d145e96e7ac872626fc4a6c249e09c5fc2c997bfbb917d45e +size 759237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 37d88dcbcb..adff264b36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27351a8b811e1706038c8b9da3802fda80fe23041c34ad34719545afab2a5f48 -size 721737 +oid sha256:33a1959e9106311b3ab11bea50b6f6e2b8310c635c9b2ef3ab53ff1a6f9f482e +size 737819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 97577a6025..ab35eff934 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f04b570bee24dec0facda12a5288a8f7fefefc5176707b436b67c419bb44045b -size 703025 +oid sha256:f677c90b26bbb46e10ea2e9f4ee39760a54304b0fee30bdbd14e13ad9e62ec81 +size 719355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7646a49fca..a6010ed6c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d3be9cc0c407d4740273f977079d64132616d2271ddc343e898736a90863cf1 -size 619003 +oid sha256:25563ba040768721162d6ace9c21cb699bdd9dab779f74e7fc19aa41d85e41d2 +size 638193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 60a6591c32..ecc175869d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d59aee6a4138cb9b4dd3f307f78c83e42ebcea2ae8092230ffed08de3c923694 -size 723025 +oid sha256:ae6f6b68ed78f99d8bad1651f3021163254bfe0174dcb5377acef564b161f4d5 +size 749567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ce041f2686..51950457b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66dd7c52b6f2906e200916c75ef7a3a1154fea65b08b4a7d56216722b6fdb038 -size 714583 +oid sha256:7c39b7eb1ebcf984efe301107333c226873b74061a3ae70d9394baf5c4119c01 +size 728151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4a2c87b45b..cb7f3449ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69beccebd55c1dd8cc68b23066e5dc1fd6fb0b147ef8d4fa7128d516514faa27 -size 695873 +oid sha256:3955fca73dea44268499ad7589040f11b332fec72f99f3d0f4e9219b8596df8f +size 709637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9a89f4586c..675fcfb376 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7af8a03cfc3fb02449b760996bcbfda3d2eba5f859ed857f54ee62bfede20c79 -size 611848 +oid sha256:a4c2a3bd24ce5f1997daec93b35f9154c2f4d475dbf630305245fe4c2e06c460 +size 628525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c8554bee84..682dcb5748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9263164a6c6322e4c529419223d0985698ad2fbd39270163ef28978b41b7fd15 -size 638069 +oid 
sha256:a118e30aa87c197db86525c9eb4bcaf3a1906cea5fe21467ae245fbb48999462 +size 656471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1d00605b50..70ecbe8c28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c16ec7e6a3e8ffa805483fd235b5d71ede6c416b12691cc71a3f3bba62822b9 -size 603040 +oid sha256:f68af87ca700f3e5a7894a473f70225d41c3857e3459d4d148ac5f80d421fac2 +size 629631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5f6b44e63d..aa9ed4b9ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3344376f219df977e7d4b139e081525f6569a30f72270e801a3db2e36dcd9e4e -size 629923 +oid sha256:7e796886034fde935c88827d2b784e0b107494913b2b7ad8820ca82f2d0234c0 +size 650199 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54d114b202..e3cda604a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f0369649ce3188bceca50b906a638e45e76994e459da4383a31ad20a5e49082 -size 597608 +oid sha256:8b3a01bfcc1453a16ff6b2bbb55d5be943b5c17e0fa83e27a952f060e6d37af7 +size 622571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8c32f99a4f..59d7f1e7f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:353efafb109961700e5e9cb6a2d22b547ef968fbf5446747cf6098a0b3352714 -size 635337 +oid sha256:244c751edc557acbef19d8b47a73fcda63566d13d780ce913c267f4ab36f6fdd +size 644167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 967f67abb7..faea4182a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae89e0ae6c3245e81feb6d29c5d2e359d68974603d508e203b52933f56b9f20a -size 551362 +oid sha256:538ec56df72f0a4611b6255d7f75505f78a7b47f3bbe808e70370a082588b955 +size 563300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7ec4bcca4f..7fbbebc9a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad046f211ef820aeeb1e210e809a630a98d17fe2c98029cd4a67dad26c3c1de2 -size 598334 +oid sha256:8b2328b6f451caf2389511f79c890a3e85ad81cc5b42f37fbabb21240a9dfe6f +size 613824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ef715e7051..367203d708 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf8da332db69791c70808d1393ffad7df24c1ecac7f4736d51707a72ea146a4 -size 515198 +oid sha256:b2be6ef74d92c5456294de200733fba9b7aaa7841206a1b01d9c25a59afe69e8 +size 534932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6557b8c20a..6bf0c9026d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69fa0d42b4602f1c03788198ef2641356af49be0696dd1dc4df5c843c4a9cc16 -size 630917 +oid sha256:e4ca59b71c3b1c8c8a866d71f19161d2202c08e1036254b9cd87d4e38addbbeb +size 646801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 49406ead39..658ed022d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b0785a41d755f30f590db953d1821fdd5062bade1e059804dba53291aa80132 -size 595886 +oid sha256:46303ead18ab87f0d69e0e4699c517310821e2011b6296b054f2b70db67086a3 +size 619963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c1a90f07eb..effc242153 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c107cf4dc7cef45a95d707ad1dbd8e637c9f094765c2a5d661fc7d731dd4bb6c -size 622771 +oid sha256:db97ba4574e3c976ba45792f6a46e614307bfea1e2f8f8cb762d963c03181fad +size 640531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e53b6aabc4..97b2e80daf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a852ba835ce9fdbc7646c1f7c2ac0af7534d8c752098d4a902758fdff4e4deb -size 590454 +oid sha256:c1532a2496fafcd32dd0def2b1e32f1c289243a914ed178aac49dec5ee7b6c59 +size 612900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7cb3c8f47f..fac9d30b37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f36db9ca488072f01040ef43d3b6ef2cec4361e414649a0f7cd0d97386b89b2d -size 628281 +oid sha256:bde9a697bb26c15a2a61d93ab7584a8e3de1766b1bebcf87b55af780a45ce243 +size 634349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 63a1abb363..fa96ad0e4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fd83d4c741f318a246f1cf5b79dae438d715745b198654bc92e4a4c7f0b794b -size 544208 +oid sha256:2bf3ada07f4d358f999d8d460d0946672b21ed2991c7e761c70af2db8a6078c3 
+size 554420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 33ec6e6368..9afffca895 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9491dd50b5a3676134726e84637cc91ad90ce70c9d5c3a27930a67ce3e7c8da -size 591278 +oid sha256:17581dd3cb0d12457d8fe00a25c982ace24bc8e8f8c6bae5595c0e267462f4af +size 604006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9210ea3cbc..111de689a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa7c0adaced79286aaadca2a7f2d2138979679c82e17be57ba27f7db78e98ea7 -size 508046 +oid sha256:265f04caec0038780ec17c9a71dd2e9d5610ef3c21f89e9b400b5a53045ae206 +size 525214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a6a3b71b30..7b55ff4e4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4998e877c0731e2b075f253e8b13fbeef2998338b310a4b145622051dfafbc21 -size 658133 +oid sha256:729f3194e5239fefe0517c66136b36bd8bce708ed8748788dc22942daad835f9 +size 674067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7b1cbf1fa..aa53bf7af6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e403d75546020a2194f83437565e06db127292dfadfe4fbf09f2d05285b2c3e -size 610770 +oid sha256:6cc04c31979bd134f829b5d4ea423cab4f9301f052fbf42220cccae6386036b8 +size 646439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 66958df7bf..2ce3f084f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3999a747347f3ac02543597fba16d91848eb941383e16089aabbd7d991e5a1ee -size 651319 +oid sha256:7145f97cb46574525b045df6911c15f86c395ca8f8492d184f655ade87285f6c +size 667797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 748f082aab..b1c021138f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4bcbafaaa55c5e0fc42ee8371a60b8383067d9f0a9b4163ce53837883ef2af0 -size 616586 +oid sha256:f38f18e1f27965cc254d52a51e757dcb75fd11e2a35457820d424bbdb04cc1d4 +size 640167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1ea96c631e..c8ffa79641 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eebce5d3de3a6972b5a0350e2298fed51a599964cd1498ac9c19b82adc31ccac -size 661813 +oid sha256:ea31dff1cfeb039def001c9c950c86a44060fb52ae61967ecaa8f564e134be2a +size 682483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8f4726ea46..3dd6b3cf03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c26f5a849021c6235f0e4e1345bdd8753f349fd90d91d3261b31af6c67f890cd -size 563582 +oid sha256:96d5421ab75b29b92c053e816c1bc92ff1fd467df13ef9bf1de606d7373f5a11 +size 574434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0fd90b9a86..febcb262b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66bfc13700a02e3cf66ab792f164e7940df1415fbcf73e0b67c84ad9871173db -size 621013 +oid sha256:5a8e456f17dd518ecacce321a37b8b89ddf8565cf44386b7fea43b6f7ba4ed61 +size 646173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 67ab4535f9..4aab4cea06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cef365f8c372da522ff55cea827e268d8630be58abc793ef9a297ea170f69155 -size 526036 +oid sha256:2f90515bb0c5053fe9ee5b949b5abc7b43e7ce5fd9ea940418ec32b15c361ff4 +size 544586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 424694cacb..71c9549ef1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d098053cc11b86ab733c5370ce27984a05b0cb1dbb4024e692fd89465188ad6 -size 650783 +oid sha256:e2a7dfd335f227e77a2808733727a7e82a90993999cce2e06fa85b8b98e6356b +size 665187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4afc5b8c8b..c508d65a57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10ebd868178b322e91fa03213491b3be03711156a6b0a89439e4abbbf103ed8c -size 603616 +oid sha256:ec0fe782608cde534ae3ee1995df0bd1bb2f8ef17fba8706c4677b7c1ab57aa5 +size 636769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b69fae792b..3c463497c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b202dd2605dda5b6982b376983d711b6a9a1ffc3308f90f9af8d0781cdd8c67 -size 644165 +oid sha256:c811ed2f296e2a91c310ad4b87a5f459e0d70e214485aff0242cc7f0900fc901 +size 658127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 395d5680ec..7d73fdb932 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17d376cb3ae94e6d937d58b4211bbf237f99b95115cbd588abf257e06d6b056b -size 609432 +oid sha256:fee703a36eeed6011deab3d0ddb02779fdfcd5ec64bed8e6eba61c87fd9d7e33 +size 630499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 24323f7efb..c908cb299f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6eb03856f84b6a5fbb4d392962bb07931acd0c124e4fd1d51b796a408ed2efc -size 654659 +oid sha256:bed3a0666ba0d7cf83ba5235c90fa042533d1356ed9e820a0f78d5eff095228c +size 672765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 33a2a856f3..adc70ecf3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e82604d1bad4dfc44f7379326a2fdb715e3fcb837e676f0bd1dec9f663f0892 -size 556428 +oid sha256:2bf8a99e3b1a8887324fef3e6b34ff2d510c5ee2aaf6309331b7bf05e5a89f87 +size 564766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9a31f5a74d..fc5641167f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:426c4d56a2fb187bf7e8f1be6815f476ecc7cb6c4578a7b8ec7b98b660c8ae5e -size 613858 +oid sha256:73993a5553a4f9a5a470193558d5e5e074eaaabc7291478c3192d9690ae8af31 +size 636453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7bbde3021..d6f143314a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a90005945aca4f8032d5148aff450a44e32fb56efbf2ae726ceec57c219d1372 -size 519672 +oid sha256:dc83f525744be8d93d10c2c24225c974700d7ad56d0a3a5b9eb7f5550f56b3a8 +size 534916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 30d8c45ad9..8e8e7e9e4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57f08fdc1d0f49b664bb9ad8eec24efaeff89178fad98c4d26d25406a669dbce -size 622903 +oid sha256:56310c16f4b141cb70c5e89b99b0ff9550fa241fb0c359f786dd2506e081b3fa +size 651541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 421ecf2fa9..56648bd2f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6899fb1bde47963bcca2a63d51e93ef693bf974f1f323761ae3589840b45eefd -size 551658 +oid sha256:555260e94dae1ae5de5849ec11b626a0f53fa5113f007faf29e675ae98d7f2bc +size 591444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 49eb7f97e1..6ad281267f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a41877e98b254dc2ea5c668d65373e40a6fffa014cb1751636f5c9d38353c83 -size 607314 +oid sha256:22409120fa1b72bd34149273b6db169371720a8350c79deec4517de5ad6cd161 +size 632597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d780963380..a315e88ba3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c93272f4d005b8e34bb8d88ccb77d6a0bd2bc545809cf7bdc165144521e63fc9 -size 540606 +oid sha256:09c3ed08721d365e2aaf18ba9de1513aa27201b59018e89d001117f32d2beab8 +size 574868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9e1cbd6afe..5be6b6365c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5510d966fa6b1dce103427fd25aabc9560f097495d0ec6e728ceee3fb258c550 -size 474142 +oid sha256:72e436b918aa447f1279f467ee0bedb2c06f3dcc66de9b221ca780de38e846b4 +size 498906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8d9fc4f3c5..9b8a5eceb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5280da5d680ff7f72a57ad9edd79700979b245902c63447adfd6c9739334a80 -size 439680 +oid sha256:99be4aee59563d3e8bc35cd035bebb21e9256bceb5272ad0ba0b26b08dd622f4 +size 465260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index af92a42391..33c7a1b667 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c3c28f4cc5a13b48cbe77fb8852f69ec648c4353d6bcc09edfbb11212127c5e -size 463406 +oid sha256:b62a02c0b56385e74cfaecec06df81f4a9e3dcba334ceb12e070ed0e4bd63f26 +size 489750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12dcceb8b9..b57f1a6607 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3130f72c95fb9c8fcba05d12368726cab2eac5c0e20b284155ed6c30602aac90 -size 436072 +oid sha256:b4286291d808a6b3c86fc0cb06de5e7d191602e5acb34b4c3f75c032c39fd3aa +size 461628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 176a2e4f3e..afe1a127e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83b6c4da3b96564cefee87a283a98d8b4f99b621c54a213ce84189f0a5bdab93 -size 619941 +oid sha256:466831bfc7e43d44357e4c3e1900cc9ea72ce65b98639bace2f49a8cbddf94d1 +size 652329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 791a2d0fbc..d9b109537c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbffb226a07a771f8a46c83d0b6fe2ea8865751679cb000165928581cc22b075 -size 555652 +oid sha256:e87cd508761cbd1afae901782fe81a3d0a8f7b162e7eb9bcecf48876b0887137 +size 590678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5840fd8b75..c8e24fcf74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d31c23e80e493b75a4dd2dd390d4b21019f41504287c5fab410fecd015b90fd2 -size 440056 +oid sha256:9eb0960dc4dc6e79e8dfd0e864e2b4ad4111ffbfe14fe64652d391d667e4a7b2 +size 452192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ccc5b673fc..61352be059 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0465527be87dfb022ea9bf2662e044579c7a15001aebc5d6f7cf5e0608c3bc69 -size 381292 +oid sha256:90718d3eddaef922baa20e2e7d166e8580c67b73de5291464319931875ef36f6 +size 400532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a98576d893..b92ebf36f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc1d3ce6cc01e65740119b3002fddd274955d43eb7a947602f3c0a46576704e2 -size 419828 +oid sha256:a6f6978b50ca3b77d6f3e08e27217af9b101db75b9b3bf1581bb8dfd9a94b9da +size 433542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 77792fe7cb..887cff0e1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e31d01149d4098c016d6f1aada01c7987d3a1a608c35c7961a662a98450a9f55 -size 361854 +oid sha256:045d7db908196f995e62ab2ce884a43a1edeabecf30e73eb05102ca51b24f947 +size 383462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1821661f8b..f485525b7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea17a95af4a087ba1ec9a31ff8a1523e30d58db2fdd3e183bfdbc5588b75512d -size 463090 +oid sha256:cda93641d9f2f71cad388513557f67ffc38112284293e4cd93d31742e5722eb7 +size 482330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1128f698e6..e059dd89dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a55a4d75d14da3bd866d8a421d28a1bbd86d627b0672a18dcda995e70c3646d -size 428630 +oid sha256:b000e0b4148d7a8f9d9a320d9faa97dbfd5bc130b88488d09f3fab37d06ccbb9 +size 448684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bbe51f02f..61adfe27d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8454b66e6ecaa4c48d0218fe89caf824a2be2452eabff51d170a5f5703bc267f -size 452354 +oid sha256:1b8cb786f776a0cb39db0327007a5bf950a20e65596ab8553a06b29832ae166d +size 472384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eaea4985ae..697c6d7eef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0999d54f8ff881ddef254c6411e1b1182f5af45110636d8ad165f6d65ce76022 -size 425022 +oid sha256:a0af184543b2fd60f0c3baa0b1049e00cb542a4a8d032fcee28ba870868c3354 +size 445052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ef35454a1b..fea6fc2021 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad0cfccc3eee42ecf5dc55cb478901a5e650298b210e796be89480d8f45f36e7 -size 608100 +oid sha256:2b3918b0e4ac7fe63abbbbaed775ac55c32046a839669728470b310e313a641e +size 629635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5dee8faba5..f1819dc367 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1656619999a40062a98d99c18d219748d407d6730af7f0d8cabcc49045af1d90 -size 544600 +oid sha256:22df2edbc5f7b07de33f10d04544305d0f4b6d6706bb21d355550f445154619a +size 575680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9a4edbbe1d..1217a1b54a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597ee48cd3b1b1c566c65d0389938b04b33f51e6865e9e6b170b4aa19cce15a9 -size 429796 +oid sha256:794a17a55e525edf715fb693e6ae60b2a84182cf4acd259cd4ed7568996782fb +size 436406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 587dc6827f..6907308fb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db14f4f758405f8f496f9e74a435875a692b8a075c042bb0af328f2d7e90499d -size 371032 +oid sha256:338fa97ad32c4e54a980d9e3966cc5d3137128abb4e10fdda8d514cb2965dbdd +size 383956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f2efd73cac..f35e937da7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8af72234370321683ae1a15279b2700140cd75c189ec53659aff85821c288cb8 -size 408778 +oid sha256:27263657def38ea1fafb576dcba188e9e215d0130a6e7d1d2f8eb8ff53b666cd +size 417756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 53062b0866..d0be2fcbda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f719717c5bc420cf92836841404b4c4486921734bc6739550041bcbfb8d4219 -size 350802 +oid sha256:908485c46bd286f97d35f9586a56705b58ae22ffc4ca3121e2e31ceaf45fbf41 +size 366886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b2e0dd389d..80ac72f9f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:528db8aa61f4e5cc7ad3376dcc857f53500d32bde28ff9405438bd091ee997b0 -size 493908 +oid sha256:d469cca1b9394951a46c4ff8fc68fd49df02dbbc8eff71bc8108ec48009bb8e3 +size 512360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 04213d3531..6cf7b4ca87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f762a75aa1446ed2857c9162d246b0e1868b14728a6dbe49c5642004a210a204 -size 459448 +oid sha256:f3af41c74e95d90a99cc60d8da85fb88e5353c7fce4e724a2b38b2d94786c74c +size 478712 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ee6a8dd2be..60de7f0e79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:992440467d06091d909ac2017140424c3a0ef94ab5c11618223167e333a3b547 -size 483172 +oid sha256:eaaa949e12d3b5d4a3e5096bd09e63de16743dff467d3c675a5bb18a390324bd +size 503992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f502a2e8b..2b590df08d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c28cad80c2d7c71cf73b159abd8b77ddd04d26a7d29886d6f42f7a902f3f5035 -size 
454262 +oid sha256:9261efe8df2043f611269dab4fbc98d3be6041fe1ddecd8ea62322278df1f663 +size 474290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 650e1f2645..7d65a6081d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b19613b037e3c36a2b8e733b3fc233be2ee4f5256a79220ef1e8a552c7ca62df -size 662501 +oid sha256:1a0e4c606f6c455cc0990045963111c4ed43f0aac032c73c44d398935e2d5b34 +size 694297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0bea092a7a..9b776ef940 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f7ce873cf61132501bc9d9dcd5d411db607830db401cd06526adaa49170cb21 -size 
594658 +oid sha256:663df4717688c5228084d4799fd71b3d8d52deb7c4dbccf7062c222716ce0dbc +size 630599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 131239fcd0..2bc1311585 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:027dcae51dfadd8d26959fecedd98cbaf76f5707fa37f88141748b3d492e706e -size 464462 +oid sha256:6f99f43799d24e020f1b7f8130402bdc4eb23fae7ce9dbcdef3fd18e2f4c23a4 +size 486834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f7879e8971..58cec72271 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:936720e36c58f85167579057e753df5cdf0e3594210ad3fa69c26aa03a465c66 -size 394622 +oid sha256:6cb792a61669ad8359b2567a77b31e58a0a2df6577767e087ceb1f365b962829 +size 404390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c5499ad59f..dae50f74c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:359c182629dde2f03c819f87c06a3e79c341b307b4cd32f10d1a45b63b692406 -size 440286 +oid sha256:25be7a3c0b52b5df7804786e00510b71e3f0d4c334a9efb7130bfebabe457eed +size 463448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 515d7f67e6..b883c9a3dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b6398f354426a0a63130a5c70ad4a269ab43984244d818a78c042a19076bc7ad -size 373604 +oid sha256:33fb54c327d6d9d341e854c5d5ba5ef75116a3b8ff3ad973390150f6baabcfd6 +size 385740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cc5b6a5066..5b4c0e656b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:058304ae26d40bb05a4b16a924f97edaf7b85807d184e52af3e8ee0a0b0bf2a0 -size 482858 +oid sha256:c755e4e683833e7669a89d25f6611c0dfbb31f09f7b6fdd42b54b25b4f2c1acb +size 495784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7bbe85db60..14583b2e37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d3cdea40ff91bcee3b9fe2c826cb01047a424582301f0949f681a8f5b84d159 -size 448396 +oid sha256:b3190e8032a26eea81515de2cfc5f8365597a39ec3f2b05beb84b4a05851a3cc +size 462136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 423b9a9507..ada7d5d8f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5322ac84aa82b01cf5a119a1df73daf0c4b55b58f0b9e07508d35f9ac4ded6b1 -size 472122 +oid sha256:d1e335ee99423fac36b7a93bcaac0988cd3b2dfc3cc1d237b77c205a3a042dec +size 487416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fa593aba1f..e22fc828ff 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c5aa9f901ef5f28dc7715b73b8514752620757c28febace55462514a6b00245 -size 443210 +oid sha256:43fe0f4d770d725c896de2b46e0676f3e1ffee35c0aa9e07d9ad6e4838e064df +size 457714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 017c90aa3c..0f047d6ba7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40348ed9cb2f393749c346aa1e7f3f437ea2093a82b335b7d23411dcfb221ee2 -size 646911 +oid sha256:339f2af42ca066009e88942ba142649fcfc2fa2424c378c4253180889363905e +size 675353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
a11dc45f3b..aacad69166 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5989a9efa6d30e5639829b4d5664dc0108e8ed68dea1088be5a936febf4b256d -size 583608 +oid sha256:85c19183dad3beef33fe21fb76da3f72890f3a0734c93311f51cf8e3c7c802de +size 614022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 218e853e77..6816be00c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef12f5aa4ac74b4c16d7f576e551de4887c24f9fb07149dce86ffb27cc1fea04 -size 451042 +oid sha256:2744a18159d36ccae0e4977865dc53d39c84db776d80b28db2de0d84b0b6f4a4 +size 465522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cd440cc208..a2d5c9ec44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f41d2a2e3a6efe9c1f143b5ae1bbb741333407130e9840f4abd87025627dba6c -size 383572 +oid sha256:5b92a24c39e7fec7cf44bae64fdcd75a78113dd0e5a0e64632aa2e60f2de18b8 +size 389392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a6a247cf8d..4022f4708b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bf9344d8d4e356fc956ede67f607d992a043a3af784eedde1ed8b8add501d51 -size 427656 +oid sha256:2b8cbffce676d9906b434cce39aa12d2e2a63cc54bec5810aad3784cd5f29ddf +size 442136 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 36ef7fcb64..4f89b72bca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca5f27956766420d58fe0d8357be06d4ab90c30578b2c10e7205ac8dcccba89a -size 362554 +oid sha256:408a233dec6a857f9334ac1e26308d79c6e1fe2fde6ea68bb331a2316826d592 +size 370742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b51190dec8..753efe4804 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b886263552b01eca22446083100dbe4fb8c7c22f413a5583ce46ecc72963efc7 -size 649347 +oid sha256:1484bcd81645a968d2eb9ec0ffe03d3454f0543b7478637b6597b73840b28c81 +size 695399 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 518367a090..a902017051 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:614591ee54ad1964984daf36942b05a497850f57ebb1c70af98481a59c140cb0 -size 560636 +oid sha256:bf060f84ca6ce357bf393524f6bd53448ea651cf6120792373db2e80bebb5006 +size 601312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b837b72874..81cbe17f4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5980372ce13b8682d11afbd423606d5c314f84ef73f1b61a91228c27136f08f -size 626455 +oid sha256:f15e0604469712d15a805102c5d3cd380ec1edf8b013cdc7334a27638cf1ceb6 +size 656721 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7ac919ad7d..89e10fe878 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a7774dd8fef394ff2b2cdb813c6c7cd8d31915601490aee5e8422539a6619d8 -size 539324 +oid sha256:fc1dabe4e78f174cba5075865052c54b6e99fc24f5197cb56dff998e30c07881 +size 563424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 051db4ddee..3e31d485e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:170b9f436d5f035b80d03a3a1ebbf74e0b26776b6e8e034635ad113972766ded -size 587806 +oid sha256:b528e8daa17f478743ec8224a74d0ff90ca6d85c4958049bf63e7a974fe43426 +size 635463 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 46c38b93b4..5be670af29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b5316e5a4fe8568a136f7d661d053f679833b9a6f5dfd5c3fbb835cc86d11d0 -size 558080 +oid sha256:2b10a832e37754503290c6bb69ac54995f99ee732fc863134666b685e0a5fb33 +size 607340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d36ac8badd..f7ad181c4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c048382c54eea6e5a23b6b7f32045c1b56c5e030223991fc60d22e422d9a81cc -size 574702 +oid sha256:ee9c6a033b5f81247ca7024d0204e4a71cbbbab1540c8985f3a949db4325c705 +size 623937 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f8856b6fe..96294cde98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3ac93a7017356bb25e26bf4636101f6b66ed3c49362619dd0ed65892363eee8 -size 549736 +oid sha256:806338e3051aa054548cb130e9e289e722a2f2aab2792da77ae57d97cb42da50 +size 601340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ca235f4f7c..25b90b1459 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3094fd70c6480ec5211f5e95ca90f2ca4cd382315ec75b779559f0fed3083252 -size 652501 +oid sha256:9889b4ec75a58c267dd0bc3e8722a040af69ac8e8a4c617fc21e5fcfc07a216f +size 700503 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index aff53f00ca..8517d79274 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46f59ba04761b1500b0c7c1f24d14013e3f28c3bf5bfaffa2d079f04760ba63a -size 596524 +oid sha256:fe755063049f5b6a5f6e030c4fe52cf4eeaf70081106d6bc75eddd830de8338b +size 651581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 30688661ed..e62ad272ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21e1650420f961527554e708004f1cc2f38386a9ed4637b8bfd46827caba9fd5 -size 547406 +oid sha256:6021d347bba365c352afeecf1da46724569a3f95c1189336bef9bb96e441624a +size 580064 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eadbb384e1..39300485df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aafa0a87d6332dcd34376b031e09cd3ea221f0cc496852bdb83513d72d1e5b08 -size 490220 +oid sha256:9f75ab84ab35d03d94dcfa7fafbc737eb0f8087f4d0d0f28fff4f01651c06304 +size 530772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cccb02c258..ef254471e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b381e28f1828676792289de189f7a4665971e6e8bdc389b68db3d1bbc01529c7 -size 526388 +oid sha256:d1fb1aee4bd4db59f2a84084e24ce80efa001f4485ab60283add863082fe38d5 +size 559836 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 279d439f9d..40896153f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f25a350c347903fd08bb727f90421e012a74b510c8560e3763d8f216063201d5 -size 469992 +oid sha256:0289199a820ec111ebc5e4e35c4b0e25c4daea9ba31648ada302e2de4ba5f2f2 +size 510544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 51e8752402..cc790145f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69be664c151d01b901526ef6778385980bbdd308912fa52f5e0d27b4a66ee41f -size 566494 +oid sha256:b8cef8397ba141f01c9d0b6c12903e5fa5f6f694082c23a40de708935450d7d6 +size 597574 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ab23fc608b..6839e4b7a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b899f05555a58be489e70cbf8df1eb13bd99fb8a563e594bb6322e7da15ef8 -size 536768 +oid sha256:9d3d15637125c36b532824aee8a67c97772d2f58ac7035f38f7448070c37ebc7 +size 569452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2dd3f88e88..5fb4f434c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6549754e3537a2b7a6983233da209edbc260b43a3277483859f348d047c6ef04 -size 553390 +oid sha256:2a7c695377888b5d5ec9c8ef6bfdf5e2cc4f831cf5c894b14978f2d722ef6ef8 +size 586048 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aed4ae7771..eace0c3c05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dfe61cb641f84f5b79d45816cb5d38fd2df947259c7b3d2979524e0097e2484 -size 529214 +oid sha256:5d7b523a898939eecc138fcdc90335de1068da076c0470f67bbb05d8f7d40213 +size 563452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 34dadbde06..da7877f89b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a424745891d74ddd37958fb5b0c08006ccb1c3a68782f3c2da5a05467c9135b6 -size 629611 +oid sha256:1da58a80a0896b2ba6f25cb7d681c70413f9875e6b16fa553079d60230c0fa9a +size 659877 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3801535e0c..8adcb137ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eef93912f87e5df1f346e7de31ae15bb88971ea936b6e6e38572493bbfb9307a -size 575212 +oid sha256:56fd9f759644e959638ff9ed2dc3172415eb313b044554ad13ccc48ab051e157 +size 610904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 82a5ca23c8..0b6c6be405 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2add5e033cde7d0f305377eccd927474d874e1531ea95beab38e1f3798c20797 -size 526094 +oid sha256:d324c7d8761f44d0e6fc08aa59b85154577d656aa86fb7426b77ce922b5d43ee +size 544544 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9031e3d78c..1ec8c8cdb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f73b30e54cb6541e11a135be4ffad28583734574cbb7fb55e79a11dfe81bae0e -size 468908 +oid sha256:34fcd7eb396367e535030c3874d40567045ce00c1cd3ad2259de05ec3e702ed5 +size 493674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 333352b262..79448a1c80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6687bf47177ce1344f3d5799e013faaacc4d0c1e647a87b0a727f6a8527c7d83 -size 504286 +oid sha256:d23a9f0150b56e7e3f74826001a3cb933b4f7067f85e2d225889cf31c45bcf3e +size 523526 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7242abe845..9b40fc601b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a5917519fb23d1484927c57ea2a6e0d0c1d5788fa910a442b44596fde2fd6ed -size 448680 +oid sha256:d8c446df9ef7121a6f439f49249c043165a5be2afc9ece3ad8377bf0071c34b9 +size 473446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4ec36dda4..200f624a9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32ce943ecf523ea41d2c8022a8a7d698c5bd3873c1a4f14773ba0ff9a5ac8cf6 -size 607572 +oid sha256:0a77a40c1ee50875352ef4d8d4c1eb4321ebaf3dd3d074d45b29b9a8212c94d2 +size 650493 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f58e53bd8d..db33e78b2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:616c5405f624008e488caacfedd13771f180d0b5cb6cfcd5d91647cda24fbac5 -size 577848 +oid sha256:4fdf6f3ccf5ae1fceb5f19382c370e842f9a887536f5eafc7354f221eda8efc3 +size 621583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4f3e3ac397..6849116a52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5f2be3a913d49da48cc183263b12b0c54e77ca0b7a30a0f27fc078d364ac4e31 -size 593680 +oid sha256:b6caf9d70f379144996dbda05936702e35b5df5ac234e75c8bc952a4322a1f7c +size 639757 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11d62ff6cd..2a04425140 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d1e13054afbf7583d5873db85599db6430ff12b1e2bd1395960021824a11229 -size 568714 +oid sha256:3ec6196158cbe9d355a31eda573605221ca1232e3db939a67fbc65b9a1963387 +size 614792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0e17e1a79b..e2f5a5ea46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f339a89e9f9fd6c3baa9329cf72946aba2e85fb1e805a712e68bc05ab4a52ae -size 691311 +oid sha256:07f379da9f5a09951d23c6c1e745f862aa0e212ba8fa9c1cc9ed63c4ac1e7f8a +size 740891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2b8d9549c9..669fd223ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cab5fcd991598582e172e90bb1bc230fe77851ffffa0526f6e9c17f325e8070 -size 598902 +oid sha256:9511da697fa0b082daa0558c80a08816cfb1acab4d797f8f49aae8d9600c2773 +size 642661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1e7d841584..9a39fc0bf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d569215909446326fc010b33ccf4b14eb018862a3a34e026850e0ddb8a3b74e -size 571810 +oid sha256:70067364112aad98bfd3a1982b2d935164c866be6292c3111ea1c0e5c51dabdd +size 614706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 53e6e4639a..5ef268104c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ba10fb68440f1a5b13b92785ac66d260f87c24e948981de3344fc8696951521 -size 498024 +oid sha256:80a389901d02daa8f839d9faacbdeb520efdbeff9d5272b6ddc2371a962aa5c3 +size 532262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7f923a6c17..40b7bb4854 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e95ecba444f850248c8e38f82853b89f47995f19b5805404f714a4e32f4d2230 -size 546846 +oid sha256:5185aad9182b4e78642aaf115b0148e8be7307c0f85028eed0df2bb54659d48c +size 589742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 06936f2886..845bcff5b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60bd7ece9198a84ea65c4954615ade8e5e1acc2a710d79cd41a1cf285fe7f3fd -size 476218 +oid sha256:331d0e3462fb919c2fdab184e4f92271db54a2aa86b15b13c9095fd11999822f +size 512822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 04edb18e00..1e2a5f137c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c5baa3c99323f93dcabb6a6923005b5b821e83135a4222235ff7ee86bd8e05a -size 586260 +oid sha256:53c82b399f9343e62cc1c42d93daf9787a5f1aabbfbe221bdb4d6b70ab6cc200 +size 611816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f084204806..a95578220b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:649141a4265b31c72ee3d79c89f3cafbb0448275638db7628cbf9665e52d9701 -size 556536 +oid sha256:82127c61f336a432d679b900bacc8e1a20e96d4cc907b8f309352289a638552e +size 583694 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e962ea5d8..12d4a4dc53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa0d06d0cf177bb69f758966414215f45c2f989ea930e538f352a0beed73d739 -size 573156 +oid sha256:18ca94c5803151ee1e9058c0960f8a130832cdaf96ad11fe01bd86e588e8117b +size 601080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba0e51631f..d42c2c384e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:369b41a6a173128181e8ee96228dc8b68d69fc804ee74274ddf06853d11e0cb9 -size 
547402 +oid sha256:903c1edf747b78d144b3520c584c20392cdd4d69f440a25f2892c3adf2491da1 +size 576114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ac0d96fed3..80eb3fbd8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03585a8765082efe51f7b25829ee4c85444749a09673026f9c053ca0f02cab35 -size 667631 +oid sha256:d0ba9378abcde0210edb691f0713f7aca02f6a6e6e84a714a995458eccc211d8 +size 701055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e5e7a447f2..4734ebd218 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eacf2d6e85f70f5b32bec7e0038d7ae6b7c19b47a2db4a2ce134aa3876b49d72 -size 
578378 +oid sha256:0a67531fbdf2365171207afbcda002830dce647c86d745ccc140d22ea7e9f712 +size 602232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5b56d74b6a..0bf8f61f3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f53be443d24bd1bc78a2c4eb58261ead9aabad3a1a79b6561b1606435a37745 -size 548130 +oid sha256:d581760e9e0f32cf3e0b38251b9b85068f729e264caa5b4215a157ea97c21639 +size 574450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d769492b4..b1eebc533d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:69814c363d23db97cca4cadc9a11cef98fb4ff435bfc6c208ce7cd298bc85fda -size 477502 +oid sha256:39781347a283f5ae9478c88c32ba732fb98bf451bd25c1c77816f632b6097006 +size 495164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 671180af5b..69d61905e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0aa59a2ad05d4e7a168c6a45ed5665bc90f0efc16880f2bc6431e7a1d175f267 -size 523166 +oid sha256:8f70d34b8e24a90ab1098931582ecd924fe64cb41232fe2b8c49ac050196ee80 +size 550274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c8703f44e..e1e461bcd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:73d106ae6f0bc9137ca4756367f76c7846f37b0ec9a715b34a425e32d6603cdc -size 455694 +oid sha256:06893077258511c413e5ae6ab23bc17e866f2616bfd41fe93bb88d56e70ed8c8 +size 474934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 96549bada9..e493921071 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23b74693a7b56b218b730ff7726c383c8da45ebc794e8fccb43d44897712d56a -size 551736 +oid sha256:199264a4b4010c50fe1244a546ca51f3f01b5d28baf1d6d16b8afa14a014cc10 +size 588588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6eb5658290..5a8a0a98f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7534867604f3977f3894277644c11ec148dbefd5ca421f73261e7a1bf11a13dc -size 492206 +oid 
sha256:5766ea6e51097ee3bd6bf86e618d04b7d5fabbdfb47416c90585c9fa1416b969 +size 525630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 35a1759c50..c2d67e6bfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a71eff43c66babb4e1249bd1a12ec7e8041566849729697243437a6ee2b2a636 -size 537528 +oid sha256:55416b1810ca19e4c29723c2757a52e5eb073b861a7d2932ab93d5715a8c664d +size 566954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d4efa43c66..4817152257 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77639568a90959bb175499f8f9e9a41eb350f8aeff058c0e4e40cec36e5e2c87 -size 481946 +oid sha256:db869afc2251d39d75b99e3c9d48d1ef7e1e89904429c6e2c262bc2d8ea6f6df +size 508264 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8feb854b9a..810cb5b4e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0344db02b9cfad07be9472ed312d9658d63d8ede5a7804873ba554bda1d25aa6 -size 447522 +oid sha256:597cb28bbbb959384f10fb4628bcc295ab4dec7de8f52f2dd0f0d0650fbf52ba +size 472262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 27e21258cd..a2d33b7653 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4c49ce3aa2076daaf374fb9ba8438e36a3de761dcb5010fc045e32a4d48e30b -size 429414 +oid sha256:926b7adf986e9f3d0e44b601853c41aeb4369730ed257abfc2ca618b4c02d420 +size 456572 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ce37c39794..0e660b6766 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b7bf42e0b4316b873c5d65beecdf6d55e26467379725956ed9baea858784e09 -size 441498 +oid sha256:dc131f4c0f6095767d3d129ed625324e01e9412cc51eb143b7f74ec8bfbdc059 +size 467052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 906919d181..4bc97e5e49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c5ef3b3574f5fca65f2981cdbbae9c556340de35f1821777c52eb1ed690b95b -size 424228 +oid sha256:a60590ee5b89bbba675b17c8e345eb0b82b63f7b86e88e599a9f1f67c5baf2bc +size 451362 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ab70444051..13cffeae23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56418655671f5fde7794dab2e2cc012403e76a37817c2cca07b7e73ac098ebe3 -size 548576 +oid sha256:a353737ea049d67b22a872c9e328dc6dbb3715cda13e9111ce0fdb6e82046826 +size 588586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 53bee45237..d27a909f09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c072e9d5e9c06f46b0a5ce69be1042511ecb5efa117e321dbdb3e3cca627a58e -size 493858 +oid sha256:930236bfb8c421d3c5445c8cda93ec10ca0d7d8a88bba7db058bc1cc43b474a0 +size 526492 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 053acdc65f..8ea898a82d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a8398c32f1cb4ef4e2315334d336dab4867719922910dfc886f2776e2d2a2f6 -size 429200 +oid sha256:28f4a6c2d7139486a85f75a57596e2820c27256f8cfc2e0e1805968ee0ad1d59 +size 439756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3b10ae6962..0dc441b8d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69a2cac813663c627be6e2c309c56c377012b40f8c0d936063b300d4c9ab0e54 -size 372014 +oid sha256:d70b54731c845de6cb29bf44bf72f4779ce0e02898ae6f26ac977122e579de7c +size 391254 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a34e45bb51..a3552417a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8709ef48384434890b1f836258cc86a3e485335e2189aa69728a195477241781 -size 408774 +oid sha256:f86a2ec5f715c9acc5fd4cfdb918d856945e0156d4143e37be60c196aca00cbc +size 421698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 25ca61a6fe..b40f051eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7417b3d0cf0d9f77abb8e0e7238d554a3ad407c8dd63bd028f137b52be6f40b9 -size 351588 +oid sha256:56f0d93bcf22f3aa1d32386ce22217f5867c03f4162b58cf10aa86399df52533 +size 373986 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b7a9c555c1..59c3ff1076 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8d325ae432a93237e7c5426baa96096908d9974e89787f73252c2f250121d77 -size 436472 +oid sha256:ba753835286e600ebeb9e1b19928c5ae6be7af5affd823a58b66d06a140ff296 +size 455686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8cb1dab996..bc57d2c324 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5159a1ec198ecd5fd718c4a46ea94e7114cf586bc645a0adf0572c0a658c8d5b -size 418364 +oid sha256:e420b50dfbd8f4385b25c980d421971d755ee21032c748ca434bb69f81734b5e +size 439996 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2a190b26cc..96ee4e5aa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cc26ecc4219085c1696cc2b56055b2e52cd015b874e5ca6e531962233392d04 -size 430446 +oid sha256:c8fa1214b7cadb2b4146274d9959e433c8c07122717550665d49315222caafbc +size 450476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3af33f80a4..5db8ae5fa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a863fc5e2e65d51a50a608e59c8954b12085e917b0974c90fa36a44eb68d0290 -size 413178 +oid sha256:b7fe3cf10d04c62b6eae442917cb4dd31f7f9901c0d3171799a297279f0f474f +size 434786 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9ff716b3f3..0c5cf9aea9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08176763e6570fabd766bf891370864eadd5eb792ce50cbebd115dad57be977a -size 536736 +oid sha256:36aaa0c2fd57b3514de2a7eef4fc9d530bd69058c2c70418ad7e0c3d80184daf +size 566164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index bf3a7da9e2..919e2f6220 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bd6d7343fa6380f324ff57c45db12b18af8c60eff4d4221c5f5f5ed5b8ed3e9 -size 482806 +oid sha256:eb4756fc59ffe3990cb0af4218199a24180f94310784153d19e15ecb098b4c99 +size 509126 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5938116afc..2e1493590c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b723d8a7087244914050f796f68d2d365966c4e44ef883703b4b0ee3c425373f -size 417360 +oid sha256:6e8d925c5ff36ac58ddb21676c15ab87bfa5622cce8b1dff10dfded49c56bce5 +size 423180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c730b02cff..0bd0742a31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b532bdb072b8ae82469bb22ecef182c7cafb12a1ca847687986974d566bcbc8a -size 360964 +oid sha256:7864cf033ae6645fe7dc20ac44436da0c7f599b949b8096b8592b60404f5f2ff +size 374678 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 30b6fcd4cc..a609073343 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e41f1732d83f9daa8a5018ec5304682223b735a964d38e335427b174e72bea2 -size 396934 +oid sha256:523faae937aab65019683c7d621f25c015c871cc5a2fac7083eeb0befa6e2757 +size 405122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 476b01bf65..d0a358ed1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc75ae4586f4a5d3b35cdcb6753da3f25a40a2dcd896a73c9bb2cad1f703edd7 -size 340538 +oid sha256:2efe73d103bd33800fae06c81435eafd0a32a9b5dcbdda22bdc4eab2b81277b4 +size 357410 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8ece40382e..d737ed5084 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e029765554afa36a417517b87ce2d0469bb802fc993fe667e7ef474cd5ce3bd8 -size 468078 +oid sha256:e491c77b84ea43da36dbf907af3b182eb9cfe099bad55c30e13983c7e661ad97 +size 486504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c730d9d907..01c21d8500 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8ead6f7711a85233251a912da952130b7daed6d2765a01231ee3328a390652f 
-size 448392 +oid sha256:50a6d08494040cc80bcedc1e81d9062fc9fa086c4400329195ad3d4066042cba +size 469236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b7c0fbd018..1ac222301f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597b1914ed801211cc26d5b69d0b4eae6cd01c09e19333c4605cd1442f8cdcca -size 462054 +oid sha256:695859685e0e26ad0de9d2b29a74a974345477909e259e41736ede5401f10be3 +size 480504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e4412cc69b..ec4c2a4895 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:480a593fa3b0e4c8a1ac5b1617375381a729435215acae21135a09e475d1a86d -size 443206 +oid sha256:2fed9b0467f2744b1c2a8f4b70f51eecbf70d90c820cb8bddbb1c69901e91bbf +size 464026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 90e175dd90..0b3bc6c9c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1df2d660d14929c3d814a1713ba3a3c0f73c6f82da8734d4f6c352dc07394e08 -size 590544 +oid sha256:44e738d4e0635db87478b422bc7070ea51ee555b63194082ab6119ed28b42693 +size 631343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a13b72b662..2a4a7eadeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3b96d4dc68b7f5c38bc9095ccec1fb4ae0e820f66b8ca3c998abd4c99b45c1b5 -size 531804 +oid sha256:d47359ac8a47629c5b54931289991aedcde61a31c9fd802e7e1c1fa1e66c094d +size 567596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6b2f58c5a0..4423894101 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc108c676e353473f5020038d34f2d738ab77ca99212e2f2ff6737d693bfbf48 -size 453604 +oid sha256:859bad070ee7a082874aa303ce6fb04e598a3a941160b576a3f23b067733cefc +size 474398 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db1b972a83..68376af445 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eef3cc4a895b07fa25d485a25b64fe9200f08a43509aad6d7cd50cffe877e977 -size 382976 +oid sha256:ca3411311098ffeb5441fb72abe11802d428438b42709a3993705746d8ba7463 +size 391954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3073982e4f..77226d419f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdbf75725eb09812cd5e9ab158b01a9f6253cc2371fa1483f74c69a38abdfd90 -size 428442 +oid sha256:23c458118fb72ea47a08d23d5036d165871070d077702a68227e6f7a7246902e +size 450026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 47df890941..611f1e91bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b548bd3db135d40680107376f61e3708112ba5143c3ba293ca8843c5ca3854f -size 361760 +oid sha256:d3553b844b10a0fcf791cda873fe33c55c0aff9afbe815496e3f84c1cac8ebb6 +size 372318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6e9aff04a0..b581f4bd3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4cce15a7af1768d3057e107ef3565cfbbaab787416fc6a61897568303a959aa -size 457028 +oid sha256:6d91aa9804510ecb4a4cbe1738dd74495dec81743274a68ef4b2820acf3b35fe +size 469928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5927156f55..842ca712f8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73e2f3eb4bda6c61a37696e4ad8a4aa46ab44c2c09097e578480d6906cb38987 -size 437342 +oid sha256:7dff6f860f14ded4f82a9bff4bfa9cd4f74b03bc21eae0763c005c1f1a945020 +size 452660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c719521ba..84093f4dc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:398c5bc760fc54905b3d1f8eb36e24e57a355f4aeb0bfb283877939ca68b2a26 -size 451004 +oid sha256:e6984c035d21b487d4b5323b2ff0e736f20fabaf40d233db2ccdfcf239f36d29 +size 463928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 65ac85dcd9..5e71050b5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82e3b8ac5cdfdf060d697850d0f62bdea69452dfe707b06ce9aaa05a4fdc536a -size 432156 +oid sha256:e236624de72b502f7c2c36f939b9808b87c11d90aff1b2eb2d1ca011c06900f7 +size 447450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 755af78334..263f91b562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:787b0d587cf05259368beb7d8f746a92a58eae352a1a2f9367851075c4d261df -size 577124 +oid sha256:a641416f214a6084414c52a8d54349ea8e89447b78cbd5de634b476abca09dd3 +size 609710 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9e9b208892..ca27a0adc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a844dc2b5084e59ef9179fa80e8ff8115a33f1786c1e668d3a7e6145ec20c350 -size 520754 +oid sha256:c5434eb30781bb0341d2ea3cb772bab4400cf480c747efff739c32a89fa86542 +size 550230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0593e59652..8a5d8d9c34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56028e80dd5b0b5846436cefeb6994b2b0b2aae0282ed8ed4f6afb719e54dc6a -size 440186 +oid sha256:61276221e4b377645cdf965bd5d78ef5814155f0ff00e46eec04c68b477130e2 +size 453086 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca3e00ea68..b3ce648ee6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1df31f5e78a316107dd4c16a3e6c35972c45f3c7c51ec64640b5b17b92380472 -size 371924 +oid sha256:d987e4299b8e0db81ce9918257e12ec42686c465306bbbac5aa9cfb324d5c5c4 +size 376168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 19a8f1133e..ffd927c0aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ae50e9c8c139b298f353ac3840d24b5de9357d2a380031ee7f9ec97a7618dab -size 415812 +oid sha256:45b005ca51f406462c4ddcf5638db48ca7b38dd7237d2f059dc007a4c1aa36e9 +size 
428714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 257beda06e..184b7c487d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6de41dd54fbfa28fde081eb1e90c5ac43ff047cda7dd677f2a97c7aa8f1c9533 -size 350710 +oid sha256:ee026bca6f994945ce4b9df95b8de728885913571ccb196889bcc6e0cdfa9c4e +size 357320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8091f58781..dd013bbfe3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:760aeb74af8f5327e7fe13f6c5b0d3cdcba0d94d0757461148cc6732978b61be -size 690669 +oid sha256:e8603f47404a939f550b0e5e89432fd3035cc07db9c2ebab7f736743a520edae +size 709267 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3ae272f6b4..6529be94cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04e2aa637de34880eddb4f60e4347066a9f91bbd51952151c9f2faf4e8979a56 -size 607928 +oid sha256:5de627bd6b703c35b6e80816d0564a0973170a80ee8b3ac88cab57c597d80892 +size 624407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 77c5553985..4990d27fa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7895671443e70b528e2d7c16ccc65333acd36a1a586cbf348dd878fb564d4c57 -size 696143 +oid sha256:2aa95b30dd2a79f3e2b9a49eb357b01b99487cbbe90bebfd2fc536250b7d38e1 +size 712621 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 058a4050f8..6e51b06667 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cbe88af6fbe07c80d964df3403862a115533b13a07d58758691f5b94f946fe7 -size 614536 +oid sha256:fbf4fd028462f382807c07ee960a3e9e18e6f725464ebae38f18d661c92bb79b +size 628597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e576c442ab..9e999a133d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7bb68922112e3197fcdaa4fac3c6148b8c4b983a277b80f195492ed2dd00038 -size 758535 +oid sha256:6b675597278450762d649e84426dac38e7920ac3429843e7c401ccf4da59e599 +size 775703 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 476ece3ca4..10ede0eeb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:112848d77de06e8edfddd0d98497faa23d036795ea59d2267c1c88dc05508b12 -size 677373 +oid sha256:7da541750640cbe3d3c607c91913835498c4e3f020ad985fd32e84e81ae0587b +size 691827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d8113cc026..58e395d822 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2b2129cae8284776a140e9e956bda11bb6957c8a2ac0e707392ee612e1f2c4c -size 789781 +oid sha256:77de1791d3e32c45e70da9b0d28f52868685ad6868ec774d79d192e5519f2940 +size 819825 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 31cc083acc..517308d6a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f087df0555f4b763bb1539af051d970f77cb95f4b6e0ed1450eb62bb923d6dd -size 698705 +oid sha256:c2765f977ee016b6946d52dd56667802e679a444d064a72702b006283206430d +size 728551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8f1cbe8ecd..8132329721 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7396b7753afb23490adaccdd49a67d0e496c7f7215a26893ec4d6b1b6dd111ec -size 775525 +oid sha256:20c8a484306345c22e8acab97e2b392debdb648d5caca4b9739b1af9386b9b04 +size 795109 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4ff10d5527..58e3bc9933 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:838378f94b638f0878e7d17a82728d272ff5ff734d43957d903b77c1de33ecd0 -size 685237 +oid sha256:c9cafdee1cee3462d9130320b9dd16a846c0b83128099d2b20031052515f9e21 +size 704921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4c86e9d368..8b3f36b922 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09b75b3c08d97e1f802656768bff69bfebd985fe3499426b80986831ad5bb9d1 -size 792395 +oid sha256:6ed57a5cf4e3501a42a7bbcfb31abd8026c8247aea2c327658a011a215ad4fc3 +size 821995 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 63e6c954da..a407348f63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab7bf81cc5cf20b9eeef41c6aa15365283763de4b87d521ae8f9bb358e2dcfb3 -size 704623 +oid sha256:1fe2ac62aaa757598c41bd410f5af8fbdcabbde4d6b2884abfa949b11268cf39 +size 733631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7f1f6d24b0..a3b8e50294 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88c4fa0a3a81c4d6de84f0e08d6d332dd1e58fa6c5abbea7cdd1cbe088b1b745 -size 778087 +oid sha256:ead5385c89cea2e15e111658165accba6b9d1705d6cae132c4248e72f717776c +size 796587 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3fde51a988..3f355a9c7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7df2d6718b23a3cfd40d05ca683afd563009bc907f22dae4e2439ea6ac7757bd -size 690315 +oid sha256:9bdb571547e4be060ee5fc885a95798a7e6a07b10aea51360f927bfcc9c8fc0b +size 708371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d0fd459a7d..f8b41ca39a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a945e6393abbd1634ff65f635a78d2cf396fce390736f32c42e4b5126475813 -size 860065 +oid sha256:6b6c085df62ccacd971201ea99af5ee1d29626fc74a5543ba553e1a194001aa4 +size 889269 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 175cfffe62..da7946b501 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80b383314ebbfcb97ce9801b4a372412fe63b0ccb2dbe1f9714125c47d1f17b0 -size 773131 +oid sha256:46d12bed2bfa9aa8d0b73cf7a33c3f0b23f77a6c47521f5de0212dfbbb979f1c +size 798093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 49d5ac354a..fb8c6ca3c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ac4af2e21236c243cf8c5e631d105b4b3f6cba9a7509f7bb7539e62f6c162e0 -size 845757 +oid 
sha256:c36501fb0199824a0f5d40c07562dce121d94df685042190c704aae8ac69f529 +size 864603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a9a3b2668c..b522571907 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:827c85e660f0aa839db967aca6baafbc7dfffeb18e1df0c03fb5ce96334c1b15 -size 758825 +oid sha256:1ed889cef762b034ecc71d64832b0e92f146fdd63d27c691f9c0c37960c0348f +size 774513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f425d54a1b..ebfcfc306f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1523537bd4c7d6e8fb04c1390fc25f7c86bc964b897a27db3bcebd90db4fb76 -size 648835 +oid 
sha256:0fa1b43f0be13d6495ccc5ed5ff02d587655e2b1d13b620e18b6cf89d82ef870 +size 661563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5de40da780..e5aa2aa745 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48a1b5a24df0cd0c32f7646a0fb1e168dafe740f6b9acde6fec1915e5f8f3859 -size 550898 +oid sha256:73a3578d6ed48c92c0a27981ac2f7edc5686c7fff5387608b57fc01c14b65b01 +size 558150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 54b8f9a0e8..ca5aeffed9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6725aa77ce6f588d40386eab638d7f9bf8e18e73c9149df98e4b75d70aa28187 -size 646317 +oid sha256:1552da960f9766d2a74d63e8a88ccd47f1c5e3d943bdf29c47de70a4f8c014e7 +size 659685 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9befb8e0dd..8cb8d0f5bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:658b9cc00c6ac1cda9c00f632a4fb7db6c0acad64db5730e7cb0a7ccd228134e -size 565500 +oid sha256:2246097d92923c6a0127a22cd58cb3c764734451fe8829d8a201b8ab6d993b15 +size 577586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d5bfe3292c..74db97d4f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6f746cf81d0715f7738024229c92180036367bc53fd96b1cc45bfef89b8ec3a -size 716799 +oid sha256:950e0b8e6cc16e89e7b62686daebb1dd6ed1efb8e4bd57ca5234202c6b88b828 +size 727257 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f58b9aac3f..11c1fd0a03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:781c53c7f91a63533360591e11d9a1aeb1caeebbc6a79bfe41224532a82c2380 -size 619159 +oid sha256:1c132bf82b3ef7c3d6d824f6d6661197ce349a56de6f0a1e9ff2b658f7354340 +size 624833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 53d4d4ece8..70976a6475 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ebc9cd77edb7c5b532e2db8da5a63bac9d5fa5796cce9f0f30ba55692f87bf3 -size 766941 +oid sha256:5030d38920c30ad48254d5081c8642d5a6da7461707f431d4099d1b6ab14cafe +size 805469 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e420433421..041584ef1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee326d6e8e08d743cdff15923d0f0a0ac51c95dc588697f8b17b7b8329103c87 -size 660273 +oid sha256:91c069a14984b784d6a104e5ffadb9d29b6240c2760105674ca1d40b60c90f9f +size 683461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4635075940..c14124f045 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd35259f4ee4125b10f062ff35cac8a1d6270075a8ab61294f448ec1967040db -size 739165 +oid sha256:238a5520d8b9fdca7d42887fdba857f885f87b4317863ebb4b3c42a01c8cde93 +size 751153 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3f42dee42d..e405ec2b81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da65eed78c92dfc879ca0b7524a136b7a3343cb779edfd75fb1dee6ed0c4fd2e -size 633239 +oid sha256:b6bbe3af8e826ed340f4b78feb559725045a003f4593a3d1a03ad0ba14dbb958 +size 645917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 31086c364d..9f914c5e7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:880aae9e5324a5f1cdcafec4b2aff33d6c95ef4055ed385bfbcf37fb45224538 -size 760821 +oid sha256:2abcce2065ec670253ad5a51d825813b469f97522c60e6159e9dfb2d510fde74 +size 807441 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f49b2cf752..7d2bfbcb53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74e050ec152c0167cc81ab665ac0b9390b62cfbff4c0d9eb87e03281cafeab1b -size 676157 +oid sha256:ab2b262f98d466bd3f82c2ac219071000dcc6876a5a11a2b873b28ebd0216e76 +size 712959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2532dd58a7..4ebfa3e023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35cb6010fd5a87e9260d709499948ad59b6c295823aca78e47516c3f168ce887 -size 733047 +oid sha256:636a12051f3695cce2f2cb18aa2dc8207097cc32a155052d2f9fe580c7958263 +size 752335 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4d7ad6d349..257c78e040 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a17bc9eca373d739e71b81a41a6ef48980756805345702bb0a36c6a603d4e7d6 -size 648333 +oid sha256:781484f353ad75a5104d423aa0972ed9b74c10e3e10f5d991710f75dc2835199 +size 668559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2cf60a905a..341f44e5b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37d0571f474f9b8dc064d284b269d357ced8beb81865c84b92e395f9944b06cf -size 829577 +oid sha256:a23a685743a923d220cef430e98335e126ab841ab806fa318959c3bbcf4ba4bc +size 875999 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index af1f3a6288..639e3c2990 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3388ec061db35cd16c6c1086462fa75e1d57550cd48e87a6955f28953b6cf39 -size 727645 +oid sha256:ba73a2740ad9dbe788b85c1943d5aeb599c73285da15831dc371bdfc3d23a736 +size 752213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0722cb8f9a..46bd2de7f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6701949d86e0a23b031afd9f6eeb51ab125a3cd12152cd31dceded9a8ad81839 -size 801801 +oid 
sha256:d210431f5d88d1313dc781eeffa1846d66db199d04ee62557e55e915f3737e87 +size 822769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 76482c35c8..da5816042b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:210aeefa3a21cebc7d96eb869ba7030ea2ad11c996fa59d54e36cc1f9df0764a -size 700661 +oid sha256:a5de5cd4a09a5a99164400ae35649a9a7d6b7df0ba3dffc692f4f82d7bdd1c79 +size 714671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6f5ddbae41..db5602ecf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fe6bef3d7cc4e53f4b9a134ea8793f8cd73a8a9b2ca0b2d18c4780284602c9b -size 651543 +oid 
sha256:8a5e57d8fe7f80bb6c9db65472236deaff846ef5f08a986c83753b99ae150a00 +size 667479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c61d599094..b7ecfa709c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac45d3f89921a7aad886934b2b9671aec5c2200d779dfc3f91c21fea2c7fe107 -size 566386 +oid sha256:59f63141dccc57497ac283b62b4a2922c98ff311cab9bed5ae15cebb136a1862 +size 578422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index cdfe69de5b..ad94a9ac30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9abaf201ca7a82097699811e12c834a09da296b03b6c3091ac2f14fbc26c83f1 -size 651937 +oid sha256:5c928a8852c63a407f50bed8d9fdc6ebc12a580dba1b3f112cfbdbf1779197ce +size 669401 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2cfd933e07..84c2380621 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0433a3cdc2cf628d0b892b56a0067b4976f53cecbd027f18a54b50f37c1a348f -size 572944 +oid sha256:ff814f584fbfc972cf8a708a1842c8cb25392dd132ed2d4c45cc762b36882b8f +size 583452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dbefa6bdf5..3825f2263e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90d1242e7bd4bab15e4fae01ba5f0c2b3fa531a66a171b5cd6f289b42c93e010 -size 718571 +oid sha256:de3e0ca924a7d01063ad76319e1d11faa7ba9aa100527165ba3b224865e302ec +size 734653 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a445163a6d..97fb3c7edb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc8fc8dc3c5100f230e76ff97e5cd66472815497a28cb0677155c108cafa47bc -size 636077 +oid sha256:8724365020d48c6ac188f3ce94276fb68e3aa0b9037f35fac0cb056ab46ed16e +size 647177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4f697ee6b1..bb0aaeffcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d85f5ae4e892f9bc76dcc1761049ae062005f3cee93bd84ab1e956b6a17fb76 -size 738569 +oid sha256:70b25e26ca85831255528d55f9d384f7cf742775f9e2f6c8b4f69e91be54b578 +size 759093 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 37f362f960..c3f5928e2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97e0833d6bed69554dddfbac73ad2cd8e9cb4f444b381b4e3fbaf6b7d1551cea -size 646357 +oid sha256:06a06904361d79f007de91309ab274182a7fb84951f87398413519afeabd673b +size 667177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b87f894851..2eaa120db5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a9d85c1906adee80fa45bc1887a4684fba945cca9a77b8640cc7e094ccea51 -size 731417 +oid sha256:188badf0cf4d059da45f4e575853c924fd4a2aac01e623e05f10586fdf9fbf6f +size 749373 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ff48d5c5e5..9b1ca6b59a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab483c3aa078d4c39dbbfe7d976feb7217d20649d2aad4546270a4d2c4ae9d30 -size 639205 +oid sha256:57fca7d14ae5fb3ffc4c167677be89b78355fe66d94d13b8fc231902b084d75c +size 658297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ef5471aaa..d2c7064e80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab3a7922a0b114770819833a48c0f21b91e2a8d687f7b2fa3a9486a0a5ad6835 -size 735953 +oid sha256:994e4c433e1c67ecb1e473faad1f160a496c2869064fa4ac4b5e40eeda0f2774 +size 759435 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index cf90a20d7f..419aab345a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0bf98b6dffaaa094b0ad68cef8db80f4e3e51924309abc4f28d6b341ad69e4f -size 653065 +oid sha256:2b48479b1e74c7feafaf4e26dcd3284aa1f9a996bb933f70f09124e8a1bc0cff +size 671367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1299797bcb..0ad96cff74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72759a74d8df5e22e856c3867f32e4a87b23499e0436e11886474b1d53318669 -size 728947 +oid sha256:5ff27eae6529a38c8b720c6af5fb0701c41e26f725017dce58a266e34d0c3311 +size 750359 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 32d9d7783f..e3ba9509b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64dd93ba55dbd642d30a94b51e5b2dcbc5242e295f2cc56ee3617ab514d43fac -size 645911 +oid sha256:5dc1546918130f85f35b2cd80955f46e36c30b64281500ddb202cd7e89c08663 +size 661747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 58dbc34229..ae6583df0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c63a37f832ef1a613cba017b5871744647d318e291b19ad55d5ac478beeee897 -size 808359 +oid sha256:98430db05d98a1aa52cda0fe80fc44ec44ed7be8425f1e3f62d39eee543efb03 +size 828141 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d27f7beb0c..eec5dd8fcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab62fe832e5fe573d11e7f09eaf660faf490238beeaa558710b9b170cc01be33 -size 717677 +oid sha256:14531b6c17ec208f08ae37f10cb7e60c01993e1004ff723931e8cd0c904e4977 +size 732723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cf57533a37..aca39a2055 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80ea545fc61d5eb9fb3b616d56926e32c88cd5442b387af0b5c41da19b9e7e42 -size 801205 +oid sha256:c9636155dc96dafd33eb672ef968bc43644e89393299950df469239dff4d55d5 +size 
818373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2551fd80c7..79e650621f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3a3e1943f3cee108357c6cb4594f4bf075bbd464c28136f53b5a07b80cce6c2 -size 710523 +oid sha256:ba6658ef3a14258800088987a5bcbd48a98a75bc5d8e720339b0f84ffd57cfd8 +size 723843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9ebeaa2629..03c8dfc57c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d3933489a4ac46f2d3b943aedf956ec9c68fa3dafb2686be9d8bb6fb4bc3f47 -size 724575 +oid sha256:ecbec6aa4ff48e1abcf1c240064b535540e6f7c68b1ce95fd464a6054d5dc71d +size 747269 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 429da4c6d1..458de30b7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb2c4ce522c129d9477967626e1974867697498b03998421fd906f686c465cb5 -size 634583 +oid sha256:953726658e372d65984b94ace2d193677f203272439783cecf2c21f0b864ec69 +size 656487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 46b95d4c41..ce3085440b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eacbc8bfd650940b63fffb81fbe3017c9ca6d2c538b8b6dee7e16c8902de5444 -size 729951 +oid sha256:0d8f709f135f2987de71129e3ff3bba7fe3991d4514b2af430596e87219671a4 +size 750423 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9eced4f781..f126f199ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f12105522d1f1f2225fa80a52281dd4468a2836ab02c593fbb23c0ef4aa859a -size 647359 +oid sha256:2035832696ddc1e1eebca7596a5f6ace972bcb1ba8ad92c5e8303f3d08c4dbde +size 669805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fc6c76ea4d..72eb223099 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:449e7098f2383c33d770503d77eb9f2f5bf5df0345abf3a242616de7e0dade68 -size 833851 +oid sha256:7295335e6fdd4dc4677a73654102bd98c226628a152fbbac77de2868d65d070c +size 870111 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 699e15b30d..d15518e37d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1b403f8bc308b222cd4d5580eab2ce5178c826a17737be4431c576d054f52c1 -size 738679 +oid sha256:777a9138ee48b5ae392e9a808b525c1d33f3493207031045ca9e16485c6d39ee +size 776123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3036808548..9d024649bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ca9ceef8f90c5f3ce920138070cd3f9ebb5c0489258584122e64b9e2ffb3a29 -size 811601 +oid sha256:e7a3df6ec470073774efc67b3d4598296209f4c77dd41baa28990b5091569094 +size 837649 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 06d3da4c3d..c6e4f5b0d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cdb950d9aef55d740cab7fb820e62ca07566d4d7fc313abe3b09a9f2dbc0d7f -size 714555 +oid sha256:f039d90525cb66fb9f1887c854fd31f1548fb733404048a56e003c3d4edaa854 +size 743463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7c72e49447..5509a805cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a34c7c0682edfa319d5ad4c24c0d33e1359bda50adaa6f8c26da5feaaf959527 -size 838041 +oid sha256:aa2cae51e84e2bd21613e61decec8e6dc4501002191e3b03173c6942625d56a3 +size 872673 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 956acb3943..22ad7478ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9170e919c3c939d8220beead1890af480a5c3f19c3dd83f2f1ba0529ed87ab56 -size 761517 +oid sha256:0a6f79556e276c49b388d63ff0e3cd8ed08e913bbd49066b9c686f4c98714948 +size 790081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9c03a4ed03..6b3406c8db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1af2652cfd0a09423cb3a72b4eb09c1087cf1cdaadd977bfaf1edf9485108dc8 -size 813325 +oid sha256:820260c2402e6d2883578a5e2f8302d0c4e80e7e5a62d089622aef7b756fb8d3 +size 839029 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index aab2a24857..06d4259038 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a473e3320b511bfe30759957adc06756e18118b10390608b17cfca062bdafc0 -size 731621 +oid sha256:f08c5c8f2a1c39a3df58d77e9011f05c0956105d6a3b730bb505d6bb2242f9c6 +size 758409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 87ef5c7fbf..61c3cdbce9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48585587a1cac1c577d9d5d748ec5df508c3adb0d88d55fa38cb41a063ccfa50 -size 725369 +oid sha256:6a3bd2461b16ca183ba75a46686d3d8afeeffb424836c789fcf5027a7558dcdc +size 748061 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8d217e1fb0..c1f63b93b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c65e460582d1dfbc6983d5396b0dde71a8cf69011692aee8455e558ba91fe414 -size 635377 +oid sha256:6e336c870d27bc16332032173afaeb055b87ee47801faf9d131095a8513b96ca +size 656491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b0341aee51..22e2490207 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e99198155dbef5faa7db3ef3002e329a4116aa632092ba189c9016905ca2c8d -size 730743 +oid sha256:99a7f66e1c4018ca1064fcd25524e2d748bd093657664f5739dc1781d6a94b07 +size 750427 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index eb84303dc7..98e8f1b5d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcdfda93e90d03da19358d1079a2379e76b46f3dc4f0d2b63a64b14d124d604a -size 647363 +oid sha256:fbf9afe19d318679e25a8d838739628e22211152f6005998dd9f883a11bac2fc +size 669809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dd186f497f..9b4cae7f02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83011a074d1a072e864a0f3dcca2f7ff78e1e3ddf9ac5bc4d5a5b3e83dc32f7d -size 630425 +oid sha256:b2c061f43d4b3b84282b919d03f22e494cbf4a3aa1081f5c134cdf37b66bdc10 +size 654303 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1121b0ee22..e63d8fcf05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa8a68756b62dd1f9878242c1cb036e19140c1bd2fa3dc60a6667e3db93fac0d -size 553012 +oid sha256:b0c78f734b872f8220c42a058aaea2ad4abfa52f5b3350721af06ecbf9cab4de +size 570970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6b162fbceb..5af82ac1e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:776b7f54efeead87ac15e1b6d6ab6095206fe563a4cb5171048ff122948be00e -size 634469 +oid sha256:f3e3a8db73e946b7f48aa88b3742a5edf63990edae4a127b5541ac57fdf881e7 +size 659677 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 8e364e5f77..51e29c5c28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b08aa8716c2644eb0b9b8939a5b49f7f663d9554bd3df9b95896f19a908c7c65 -size 558042 +oid sha256:28dca54f96f2f2dca8908303f1f2fe69eb620320de136a212c58dc50e83a4010 +size 575160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0db7f065b8..4674d4aa62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf374222cfbf80fffd88c742d1fbed5f6214a34a854cf9d6544d4e556a399714 -size 697451 +oid sha256:148fc984eef4e1686d6dc35de010a93e1715cf5b42b32a973f1ff7943fe337ce +size 724437 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c188d85e53..4497716000 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19c849ee0972453fae85fef1bb92b1f715aa88121a2cbe152fecaf747bfa174f -size 622507 +oid sha256:14fcdedde2b56f8da5a5b7144b7a6b29b561be51a013cd3bb7dc9d5627a69116 +size 639279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fe17243ddc..0bf2312bcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:217258a853126e3d498a76bdd90f616d837725530d4dcad8e0e863a3312cdaec -size 716613 +oid sha256:b2ff9f62d420f58b69ad453b22e84a78415d83821617946752e9a84296792fc0 +size 750209 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2aac0adc1d..eb40f2ecbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e601600d2720a3ab7fc5df47ca4be918a4ea3c59a7d878bb0db6840bc2ad183d -size 635401 +oid sha256:883dea7b3a53d60110c377ca5dccdcf40158ec2b9352997e815dc077c2d2d3b7 +size 656269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e6a6f98770..d865c91f9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41645dc2932cffb899622fb279ccff25c1500332285147599607d4d9769ad427 -size 709459 +oid sha256:f8505ae6894b98f7cb115e3b8dc8310ac941fc5d3f1be546c4e90794acc6bc48 +size 740489 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0ae4f36668..3be0e2804e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bd240d63d05285d4703ad170fa0ee6d3e6cd57a1712a4d40d32362e9cae2b6b -size 628249 +oid sha256:03051b96467c080acd6fb6cead37545deabef1985307e53f8add801519b522a1 +size 647439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eaafbdef14..09b49164d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01dbc73ff357ebba21bc72ce82bd31b50ddf5d9f36c969c95fc9da6f73ee135a -size 718929 +oid sha256:7b9c54fe6fec9e73b196cd8374ed3cdd6fa7ea26ee821ccebd05d230405f11d0 +size 752377 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 52146c4801..a722799d85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b1d2b6f1bc69c27db41f4eefd04767b8fda9e97d1950c896a5d2badc9d2f445 -size 639889 +oid sha256:df37ab26f68b1b4f1d8a22af96f0328cf0adc489cc21e35245c61e43eeb4c636 +size 662435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e540ba17dd..8c8d25ef68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a2d83f91b60bdf5cfc1abd165b1c9379b390cc43b4b2e05fceb31f23d892d81 -size 711775 +oid sha256:6c0ece162b0522e662cffdecda4fbd14c07df89eca6dcb2b8533644722775a31 +size 742659 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 67de29eae4..4c49577646 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03d816f023ef9be07467900f190324b53a31eeb733675fd2049e6f83ef319c6e -size 632735 +oid sha256:2ebbb44e85485a3e5712fbb25602bdf5d5cf788f4f6132393bf7d856c298f79c +size 651087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7241472fb1..12860ba9d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:082c950f3c970802e3d7a5bb71ae669567d0356e39bdb20f312ac96db3330654 -size 786057 +oid sha256:07ff8573017d1e2130ea4cfb6d463033a998eb155837cce616fa15790534baf7 +size 820639 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8c1aacffc8..c81d10df15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b15b683b53d5aa8640434aecd3a090ec7b6a41ce99dc016ffbf12aa20bde3163 -size 706967 +oid sha256:ed1a49cff0bf5e3357d06294d55f1e8a84e19c03f2f2b084fed09447a7f166af +size 730795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 76e87013a6..4ac67c561a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0856436e0d8f97af0c6455b83cbf59fb42cf75029963ab4312ad2d881d3d90ca -size 778953 +oid sha256:eecb214bacb15fccdc8533e9a2d4d26cba8c686acfcb340422e2cb4216f44e6f +size 810921 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 27de247e4d..76960da893 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf13b89989862a16977760e60cecf799de389de380bd5404a1aaeb48bb294875 -size 699813 +oid sha256:bc1a4bd327b5e806813381444989f52a06fadbc6573fede8b1d87b569f3a1745 +size 721175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fae2ebda0b..c1ac3547f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e362f48d745ad0ed40b46a65716b2a0a4e95c144de8aa5e5fc1b7a69d8e71ae1 -size 623371 +oid sha256:79b40e472a027e06b449752bf5c85b326c41fa073aaaa1a3e9b4f7b631bc0355 +size 631855 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 278a4eeb3f..60a7a62e95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ec2d2a9178fa8c263148b39d60af7e8f8937c5db0aed4e46712663113e214e0 -size 536682 +oid sha256:b094a507856b1c117b3c0e8669bfe2f5ca6338ab0fdf4e2e13c4cedfe18b6197 +size 546154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e3fc612fd6..e718b9a671 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f31160b625ad0f90875a73981f6593f1edecbcceedc82081ba842fd537939d9 -size 623813 +oid sha256:c4ea25fdc6145582971dfa810c976d40a9a24ada817f0e2745fe16852f0ef6d2 +size 635455 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index dcd2cf169f..4c03e01d32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1add629d3d0d563e58865cd211585bf2495345f8f1ec97efcb5e21071e5fa721 -size 539098 +oid sha256:9385c5c862ecce66ea67314f37cb44c69f0cfeb52f7b2045965e3c46d09686cf +size 549360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 540b5e2929..4e85d90c0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00c2a07a40b6bf3f162a0fef8bfcbf7da91247689d803614a5958227c1969bc5 -size 687339 +oid sha256:5270581acf442f7838e2039f8345558a25d22222499fe1f1b9970b531b8abd1b +size 704605 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b57b05045d..3c6266aef9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f388ea2f5895b3a60844abadbe559625d521df307331ce46bcd1e405ef03afe -size 603414 +oid sha256:8ce40f931a752cce6abc67d15aac4c5556d50372f57c952e1f078ad431e46fa1 +size 611998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4904893d42..15fae415a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5acdd4d476cf598425ff3f8bcd7ddd7cadca562c3d6165729b8d64ab0ee4a54d -size 711531 +oid sha256:9af0f4082f11ccc2dc2429ec8615ffe07052bdad3969fe7bed63bd10b4d3f869 +size 730525 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 90eed69a00..9f6c4fcaba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:496da5737f2b2ffbbe5890dfb022238db3ee84293a4f764e1f26988fe8f521ec -size 626077 +oid sha256:2c4b179ed8def8aeeabfacea5c857aa592a49bd3b33ee57227cf6e401df140d0 +size 645367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8c104dad09..e736755e07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b084eb2e08912b82b0c2257b7fa29044519b14f9008c259f465e2f1f4613846c -size 698063 +oid sha256:91652be77bfecd06d6c0ca4540fb8495adbfa4d8b0f554d0db843ba783ebcbe5 +size 711037 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 75261f6af3..3c2f523c75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a96b4f2a8e4fe2871c779d63666be9800036c4d782f035653381b361cffc282b -size 612560 +oid sha256:bb005e11140e2615ac781c307584b9c68e320d3be08f1b5685ad438dbaf9fe70 +size 626817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6ebefd4034..6958dc866b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:376768c72e6db7580b62683c7cc00eb5fd08fb8939d9e3ad3bb092b0bad04208 -size 716807 +oid sha256:89ef0f2daa94c54b76a0ce8fda683271a86e196d8d7087b429f8b22b91938d70 +size 737627 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index e8cbf95fc4..86bb900d59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c23ea63d4778a129ff7b74d6416e13818c55ffedf000c19da6bad53af2630be4 -size 628295 +oid sha256:1087948c0f4996a465b0baf70e45551e12d064e1806556d2306a8bb0a790f5cf +size 649953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 10db7de873..4589e789b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a072f806fa3e95a81172f4a251e05fc918e2a62d525beb9e2ca7bf6014a8d907 -size 703291 +oid sha256:89ee3b2ed552dad2eca9465d27f84ed0862005031289437ba0302ca76415a138 +size 718139 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 92d8aae9ee..971552bab2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56fbb9c4e7f06d272da0d3110b728677818da49cde20d37f05d5c4c7b962d1f0 -size 615616 +oid sha256:b5644fe7961b95db9285cb6a7ff8231f2aa2dd2d0d813f5b2b5ff2ad3f3fc6d7 +size 630615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 666e7188a3..c50a25a772 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdbf73127d408f34ff880959d9bf0cf6aee935f07bed85ce72b7582fab966260 -size 783737 +oid sha256:72891755605f63e7432f9a7573558a6b028fccb1053d937f5ebf2eabf8a6403f +size 810377 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 929db4ffab..490970831f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2cf6a1c2778673944014e7286dbc74869de3b5500916f7a744bbbe76f232201 -size 694289 +oid sha256:28962c076114dfe6b35bcb434caab40ca16c79a7f51164851ed92a7f9d77d746 +size 714269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 917096e894..e2168056ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bfb352fda02b1ee42c539296991560564c58dfe9bf265a1e31304a3006725a2 -size 770221 +oid sha256:8ce1aae420c84388c6c1082e58da97c368838e5bcda2b1a3e6ef7cf5a1dfb6c4 +size 790941 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 04960b46c7..774987ddb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe339ecc50ed556acdf4b5d037969263c5fa04197d34d90e176d6bb787f0d6af -size 680771 +oid sha256:9ccc076592e29fc977b5d513a09708f88ccd2210749da692ed9d0d10937bd20b +size 695719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b291242fc8..c5b1954eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7774a8465466ee57fa36a5f985de6c17e72d99c59f42fb20eea71c8dc459b83d -size 621147 +oid sha256:70e44e505911b3b6c16357e96b7d98e85427a8afc993e01398a83b7f5097051e +size 640731 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 31eb3de794..7403f62277 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:595cf3a0f4ff1d5699ff0091a8b4d42a338e1b59731a1d9184697c66a7a53b1b -size 538948 +oid sha256:9aa277e947443b714a887295facb79c8dc4a47e7f1400ecd295257ca8f967bce +size 554834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6ff121c205..e633290924 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92f01023f41b8e5f77eaed6adb072b6e512ce43b69ef27d24d6d0925ce0b8cac -size 625535 +oid sha256:1b4598037422e61964e6abf95ec8d6bbfc3c512ea1729fbb86291c8ea54ee4da +size 643147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2b9a5d24bc..9f5dd28f43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f2294cf2497da614f7439cf7d5e720e34cb46a48fda60cc18f13337e908820 -size 544076 +oid sha256:c1bd4ab37e7afef814ce973fb1d8a8c5001db0413fe1cf3dc21ef0d635a291b8 +size 558236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0367b1b392..8c08603467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97982db1e6b4a3f6cf935f94cca13bdc84138bd9733e1737733f823ee93d899b -size 688173 +oid sha256:0e45bab6f12a1194a12bcd812a02be8b6ce918e271eea5a8a0cdc604cd9a9f16 +size 707019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d31302379b..e3c41ae0dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef843787329c352b21da28fe958ad742d3ff5685623c618ce763b49b0bc40bbc -size 607998 +oid sha256:ad59f312c778c2dada83d889974cfafd56aa9d5c4001b4d4dfdc98eb8c9fe769 +size 622305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 69a2d4e49d..dcecae4c37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:771d5e0c3343e98aec65fe1783e7d97defc8182d5a03f56a31d39e254b85e9d7 -size 706445 +oid sha256:1a4d1eda10f96ecf4c0b21d533d5fe6316624d75b541d3ce89bdd977be46d8a9 +size 730323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6837ac075d..bf38ce2046 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:157bc1c44aae55424748d6cf35858763d7f3728433f2390037142bb3de71c415 -size 620549 +oid sha256:e25d1748c6beec9fb71c87d3aa89cb70f514eb22ead0b8bf827f33ad49d81817 +size 640085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b9aced86fa..8b2baee6ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54a7ad363f22a89ac8a5cb4af8361f5c469c529dc57e25c8af70548186392325 -size 699341 +oid sha256:a8eda8b9c3cb42c32a5d80fba09f59350a8fc51aa06437788cbc7710cf23939e +size 720605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 236839fe6a..32d1d32bda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6abaa342898a11964d0faf24e1b837abb2d94c0da499eee76cb6891afa8f2ea -size 613394 +oid sha256:5581cd839bd6eecb4a4de37d3896cdb1762453962399195ef1e65abe10ddf974 +size 630465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 92dc0c8f52..b4d74f129d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2484f04073e9ae6d0071a9b428257a306dec87e3c0ab29e27bbeb29694fc5bca -size 709157 +oid sha256:3c6006ce9f8cbcb5c528d0d88bb00c9460deda1c0a4fac68829742d1eac8c9c3 +size 732491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 83239a1281..8427e4bd24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6119d1fb159e8f39db25838ff5b870c50b33393f9de2c03ab6a74b812ba62b08 -size 624987 +oid sha256:7291fbf5d0be7cbd1963d96a1ff71dfff9fec75991714dae1dd9357b9a7af45f +size 643733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f1db4a1de4..d5a9b2029b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7adcf00410c47eac0e1138dff1bb9271c31fd383519066db4ffbd91c5c02467b -size 702003 +oid sha256:0751a96fd75eb568bca7b3b9692f41f0eee751e462b574ab6eeaa0c507d88759 +size 722773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
index 17fa0916dd..3f7dadc9b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:affa6aa5689aad76ac921203a372eadea356e1f1c2106532b61f28c0bbef4e91 -size 617883 +oid sha256:2f2befa45ea46e32e34d957b247d6e0eb485ddd461c5bfd634adbc8fb7d7497a +size 634113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bfedf2a780..64347fb73d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:651687975241b13ae381db9196701c2066fe63d9171b97981ddccdb46dcdadd1 -size 776037 +oid sha256:96d300d324755dec3e25186f19f5b8cf84fb5703c6a3fd8cbdf11c577e248358 +size 803073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bd58e6fbec..bd5c0ba0bf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69ccee80b7fbe97b60725ccd6a2e976e178f2f7de9272bcee87d96abfe53426d -size 689697 +oid sha256:5840eb0705fb92791ac0fa9d2604fbc5d86058cbe16ad26a3c028a55a74aea19 +size 712093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8134b8472f..3db05dfa2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c753022f3a4cec3081823eaa63c870db4ad2a73d1da648ad73b05fe40ff4f534 -size 768885 +oid sha256:7dbffa226966f03f28d1b59260f1da3c29e5f883914f4d348b31dfaeefe0e8c0 +size 794093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a18307800f..524f09131a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a746876308fe46d0f29c81776cd663778883188f156d4966f6c65d26876e126b -size 682543 +oid sha256:5376ffb89af1ea11dbbb5ca9008eb7cdfd0466568f427edd22095e076e570c62 +size 703263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b7833588b5..4251ddd1fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:773cd1bbf088e7cf3f29743c307c3cb20abd0b713f71e244cf5cc2eced595967 -size 657177 +oid sha256:f227305921ec57b5bcd48452686aa8ab3daf738747f0cf78503c0cc5712b4ca7 +size 684953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 05586a6ad3..3e0271139b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdd156ee7fc8245199bcf2fb176d46b846dde6baeff52b3374b79463337fcc1f -size 570588 +oid sha256:6e6f36a1c69bbcff789edf9050f9d62c4934eac5e55d4db65446d0b523b00398 +size 593134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 46f9198452..9538f987ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b0946843bc8644e5c9ec89b32bc2faeb384b98c4d26dd5e2821057f4068d4ca -size 662455 +oid sha256:222e5b1cc8e56fab03201a80c7d49ee8844d898317ec82f68d67381a272b8942 +size 690377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 37bfabecd5..08ad7eb5ae 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:951cfbef15fc5d5e3eefd5eafd0b76f1d1a74a3d2c188c4183c9bd2d9b5c048e -size 581834 +oid sha256:9bef289a5a1e9cc35bb7a3b91f0acdfc7c84e73626a21c4f7abdcb09a5b7d311 +size 606846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ad56a1194b..ccd649505d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c66aeae3151dd2d32a0964f66dc921a11bb9474eba48982cefeff1068ea2cbf -size 746967 +oid sha256:3698ef895ef8f68ba7dabb8eb22dd113b3a4b5ab0087bf3467bc6fab284c6433 +size 780365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d27282fa5f..28b15a044b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58f38fbb2b448aee10e74e15509120e479e52f494c0e2dde17a74bc584662a04 -size 657319 +oid sha256:47e0f94ac645a20e1cfd4c47cb63ef00c5821fd119b71ab0c4aa5e6113c1f1f2 +size 686575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bb0698d93f..2b07a2104e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8add50982aa2e801590394147664738f2a282dcd0fb0ccc96dd6a432cdc095a -size 737445 +oid sha256:5116686d4ce77579ebb2e1a1f39a368fd36a18cfbc3d6e9a81f82d039d045c23 +size 767637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b090fc7317..24b837e5ab 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4a5c83a8f70d50e6cbf01c0448489e1d8454b493bc4227afa6b5cd591d492c2 -size 647009 +oid sha256:e3ffce4cd8996d63a87a34f3e13ae5f7b76349fcfba049fb2b9bc0f257d732bc +size 673847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6994afe90c..7e31e62922 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58955a7fdc325a60f13c8f30c439bbff297804df376c11d81e6c5d3f4865157e -size 751849 +oid sha256:627df41b60a06850f96965a19746b623077d0c3d56ad1ecf29d6a11ebfce357f +size 784803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index de3ef018c1..9cd4babc77 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:800bbf42ed3c373aa774db41a66050749846786184cddd37e4028e148f77262b -size 673351 +oid sha256:4218f5d9d8fee4aed3dda213b67a240f31f7623d0b35c8f3e31be36e486a2624 +size 702605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9b2bd9e1ca..8aa8c42f32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eafa0e1d3703602dc6a1603c9d3a1be9ca3e54a38f201b19af9f0f075a7af137 -size 741833 +oid sha256:4ca7337c7998e0045e7efda5bf12fb9aff6b5908db1c353f0ddca215bd16e2ef +size 772125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index fee2d354e9..b89335ac6d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f74d1bef751dd02156495e3627090c48b880c6af825831367f5320837c40cda -size 660229 +oid sha256:a99fb3cc0a5f132d52f12bd80959b17e1e827cf0c3b4981719a6d0ffcb68a092 +size 687559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 440b70f6d0..eb4109be48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:748252ab2c73558b50e6c9ed541f358039ec03f92c9a3f9f49e6c8027c5128df -size 657971 +oid sha256:9c4a058b97416472eab90d81e3f845e7188499cbd10cf55e4de8cd6608dd27c8 +size 684957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f267cb8f37..d883c871dc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:047b470939e51bd1a26671a79f20e5f7cd209821deabe6e5895fbc6fb1252575 -size 570592 +oid sha256:113f0cd184fde8639c723eaefa4864d6b8f711c72e34eeadf618aa9c5e14a39d +size 593928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a9444bb7d6..29a4614c63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42f15960ae4200d8faad8b0fc35febef01638d612f4b016303d14ded323cc464 -size 663247 +oid sha256:46ccd71cd8a04533a9eed374faa6ef89bd6730914e65340cb7103b4417b83a9f +size 691171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 296b0e586f..a79d1c51ed 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c283fb840d5b4d7f8af6d65b37360c7e71de4f0baa12afc9cc09fad9e4067299 -size 582628 +oid sha256:5c6afb84d15226f36afd54c61b82e4b551a4904260e271b37809116d1f053fda +size 607640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 47ac33d66c..5d3151ee5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edd69751c952c5244dc9a5861f623de0dd7c1116caa6fb1834b85a485e6ab96a -size 650891 +oid sha256:b9fcd25b4c7db083064965786582bee4917a42931e47a2362412961fb6993d10 +size 664901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ba206a4cc5..039e436db9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adc0e7d209eb6bac35144e9a030295723dccd3ab27a08719b3f7db61b6154365 -size 567360 +oid sha256:db8d5016a94f36b26a7c2d11144629b7229104550f81e5052e52be2a64b3551c +size 579694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 1dd62a3194..d432abb02e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf9eec7290836de927ca820522c666fa9bae673972b1087226a4fa5caea1a7cb -size 657005 +oid sha256:bd9abe3a6de2ff3e58c45e883a6e34877badd4fd2bc5b13a794aa4ddd662b63e +size 670573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index a0ce1e08f9..5bfd6c88a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56eaecad4a5aa0a0ccfb5eb8652642eaf85d22272aa89465b4ac602ba2ef32fe -size 573180 +oid sha256:603783d3fdb4ff66142d7b661fd8a562b25909e02c0d1004f85f8ac32e3c8bf3 +size 584674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 988a654242..bb501eed29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92ba95931565a5249ac111affdaa3cceb9b7a63c5d545879c5c95a9fc375e9cc -size 717917 +oid sha256:497ec4ee1cfe8d6b5aee5ec6a1e643276a48d810a086f521afbae0fb833d1a44 +size 735183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 84a96395dd..2ab7e5722a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acb02e001a45c4ff277eaf7c056c37d7071d95dea40d989e99e67a9811c4eaef -size 637199 +oid sha256:51c558a027aecb3cfed7ed0ffe9579961405f0a3c9c21fa82b0b6c8063356e01 +size 647955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1a769b423b..eb49590291 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f092d3fc01add619f041726b3343d02eb51212c1e87fbbf19086ad3cc0d2156 -size 737077 +oid sha256:b50fc8334738d492332571bbe16de077bd56b153a6ba553f44d6bdc7722b722e +size 760955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8d39f63c5c..2fcc6ac9cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79833d45df711cc8b4dd4393cfaf7215a98f8ca4c0c0a5010207e5a86624e149 -size 649355 +oid sha256:703fa16a4bada5a059dcc6b91628325b3104e826bca3b38e575ffc1b1b4aece7 +size 664945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 99aeb783a7..2885653fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebe429573625380d7241e9f7331dbe0bc16489d4fcff891aa9e0d39962e95831 -size 729973 +oid sha256:1927cafdf455f7f7146ddbb0ea11ee291195a73d10ff3a67e4b16ba43c50ece8 +size 751237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d8ce753861..e69efd2488 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3545a6cef96aabceacc0ce791973f1451a82d86948cc784ccd2c47b8c89dd22 -size 642201 +oid sha256:325e6691a00b3bff9ad2daec8e62fb3b4ac8830a2a3a74545936a00b127f4938 +size 656113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5e5d6e452c..05cf50d735 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f77734cbfafbfcf446e5a6c011253a60889af6bd9663ce7812390b47a129feac -size 740431 +oid sha256:af9eb1297d6b03c3bc2d7eb60c027196e0e6c9a3d52d90446279d95a134f31b5 +size 762335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8b30506418..6240321ff8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02e0c5be04de2866587da5dbcce37cd638fa40faedfb9470fc9765d731f0a924 -size 651375 +oid 
sha256:e2c7ce487f764e5f5c12f8349bbc8df5f7c3b3476fd5779c7de718c1101666cc +size 671109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 113664810e..512ce8b0f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fa5e3d27a6e05f073c75f0002c259e3c913fc6d0429e06f4bd1f79bfc06c8b0 -size 733277 +oid sha256:cc4185b62f92214f0df2c23e555b89cb8b4170c88f839be512c2e5987e0dc1fe +size 753405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8853be5f4f..722b98b3bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fbc3585d4a6adfaf3ab8d277751d0c34de56cb28e62a3c7fbff69e414edd813 -size 644271 +oid sha256:94130d9acfe0d507734934c66b8cad35b7907a075b9ece689392e2a614f67495 +size 660601 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 95992871e9..a35d87d250 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95cac7e1a6ddef16a8fd9e82f2d48b7a4fece0420a564bdcf57d44afd9187a2a -size 807361 +oid sha256:73a525928a30b16881d26d6c375bef6279b1f648dbe40ef845425ac469eecf09 +size 830645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1a5a4ef4a6..be3a8f674c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ad16404c8b037612935df8192aa19383571a11a9bfb87eab91ee7f9bf9c24a -size 721907 +oid sha256:05aa45a736544159d9e50a147678f377fdfcb9848b27a3d311fa339ebfec6bc7 +size 738681 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4d41b8895a..a20f63c565 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddd4ea8ff3eeb445212ca072c21b136a519a158167d339fb1e9b96b0fd97dcbb -size 800207 +oid sha256:5b8a94efb9e82acb0378e5a890eb02d09d40a8985d1dae8b051b198657f77dec +size 820877 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5a3c5b9296..14df93749e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1de56090b219311c3769212d051896800d742d6fea71c3b29a706119b6506c27 -size 714753 +oid sha256:a9972c79264005b0701c45ab81209969acd57df7d5d7ea4bfd9719a83af3ac5d +size 729849 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5a517385b8..65066d9ef7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a174f4760883eb2d25bfbe40cd522cb3f0d28f1f852b6a907cb27a6ba5e3e453 -size 633575 +oid sha256:bbbf5ffb10eae7b987e68cae47459ae79ec161a37f295128b7b9cf920a35d106 +size 638607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index dbd6dcf607..93539fb075 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e20ce5b364688304d62b262ec390088d5ed91f348fb022818b5135023f097b3d -size 546344 +oid sha256:394df2e6de859a4da0317cde06d5734bf1431450904be53a38a97bfead932f25 +size 552116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6b5acdc496..281ed08186 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2474d6e1382113d32700961abc648b73a24ed40c068589480b7e24e27186137 -size 633473 +oid sha256:f38ac476d0a73eebebf308f4d5ca0a5c6c36c882bb402ee9b1b2f0e16bbfa9b8 +size 642797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 25f845e39c..20d778b2e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a2d882467f3f9b7fa6597085d6ba2353ad50436208666de9363ee929bc69c21 -size 548760 +oid sha256:2eebcf77802e9b4eac128cf46300e2526feae17b4eee6329767a9f730e1a1c34 +size 555518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d7a80cda85..983ece0665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97f53d923024cf4e5f6ee204094971096de70ae905a8ea68e9180bbf972379ad -size 697789 +oid sha256:08e84d9a66c69651b70e64887512b43b0932e817850f0cda4d41e90418fdc1f7 +size 703511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 437e996a3e..614ddc53ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5353d92a16aac66dde3e6de3cfad9d85765d8e8d1828974d6f729a7ad9d6698 -size 613074 +oid sha256:9ff9ef40aed800496a9622fd94797335f93919a93e3cc632195ba97215cb86cd +size 618799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0bf1482527..7d4622c7f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9432ab3a4072e882bf6dabd8dc58ddf49a8f86513a1a92c67170155bdd7232f8 -size 721981 +oid sha256:0716b2fbaded07e35169437642246d1661c7ed4a8176b08c99e1a8cceff1deb1 +size 737817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 24da1cf418..760fb225aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1873f9c7c1fef153c88fbc1536fc3744d0ebe45830027b5fc33fbbf5a45a2606 -size 635739 +oid sha256:5cdd50069f520bb1efe2f194d376a296e0d7275e97a6d7faf0d2b807c142e5f1 +size 652167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
bd1e3fa56f..13fb9f069f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5085c51e0f331ec8a8895ce3110e54d095db0a55001afdef2d9c486b1652d83 -size 707725 +oid sha256:9fc8677e5d1957982fe593c340f9b7f363ab7c8a6440196a284d4304e05ef186 +size 718381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 711a0029bb..6fa1467097 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6f8e38c4f83bbf17e35f01b6ffe4157cc862bca2e8d9fea6069a046bf44a537 -size 622221 +oid sha256:495f0d0184c055d57cd8ba43a7caef28c2cabb3615c0f12da36f034cd9520f62 +size 632779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1db0208c9f..7c800c612e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98c5b691813aff8ee107f88d6934872d6f998ec93956f4538b4673941bb20918 -size 722325 +oid sha256:cd6a4b882da3d77cf31e53170f6d1938671d024e0a9fd05f2d5dd1094670771c +size 745955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 25c255325c..c6bd888302 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:553cfc0ecc9584513023d514eb4acbf59d5283f6be69e0d0b71ff0360fda8ce5 -size 638747 +oid sha256:b27b8da059a9452058fb31e8340f751ec283a5eccea9581297f7d3770bd028fa +size 655963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e0dd5ed5a5..66d2e2fa22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32a0182384d825f49890b1b14ca4c417268a4790b814339578061e02e1e4d40f -size 708709 +oid sha256:42f483858df682c6ab034a70d168a7b559b65c2708a1fe0b69d1541f6fa32025 +size 726469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index eee2d63bed..d7ded16e4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62ac3c319a9709906a8373c9204d072a5eab66f652ad09286e73d0b55008977b -size 626067 +oid sha256:b5f43cc289852816a0c7149e1ed30529226976dac108ff4dcd6fd804b6f07899 +size 637365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 627c344bf3..64afdaa156 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0925ae049f00507c0174f3bb044733c883f712c448f43243ecc9bef5b6ef43ef -size 794189 +oid sha256:a41644839bfe74da242653818380a199b4c6970b6e3e73c7a91409762b939c16 +size 817917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6e7ea97eba..f11bbe0826 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6ca406a9a7deb4a05437d801fade35caaafc9e7612d2cd9e75adef9ba6ef872 -size 703949 +oid sha256:3dece06b4fd55ac98a8217b8f1d7892de3a15793be2fb65381e5c8dfc0f83be5 +size 719489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5ed3692483..fdd601820b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:063b9e6092a5832c86d84da04e5e2c4f45746289e5a9516ca784c81fbe2c947f -size 780671 +oid sha256:3b69aaeb3199062b50d7c751b9af4ccf52b02d0524f4736fb73ff45ae267a0d4 +size 798431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 95ffa21f2c..42f40c6952 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35a15edfeb58f5c222a22c6988269ca244177bd630bd29dd68de87d6ae1edc46 -size 690433 +oid sha256:ae57a7dd65e597bccd5b9907ea0a33b0734143435206a8a1de2c2ac63ea90053 +size 700891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d61fafa5f0..e8392f6c8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59742af87fcb57118e678950cac6161d7ddf249883444cd5cba804d155a3d3f9 -size 671951 +oid sha256:3f7bc1991d079e2ba0a35e544c725a541f57f21b5d515adbad6dc28789d24839 +size 668153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 226065267f..6d19dcf3ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94aa4462cae0190edba5219382b1ff2b7e4e9fa0ffabcfccdf0c314ed599b73a -size 544958 +oid sha256:6347ff294c8c0126af4642fcb05bffa534b603d06802c3a84442fd2581a8c3dc +size 559808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 2efa9eef9a..9ebe35abdf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:67f987e1d12e5362455db0b9e8720b96357ecd154f59ab954c6bcb4fe54974e8 -size 677327 +oid sha256:9aca5773456ca47314ae5104a976b8b90fe053e1aa051a89ea1d669ef43bbb69 +size 672197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 155f3ca538..74d1b7be9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf9547cd369d39675e380709b715da575a2849fe2e13bacbc5e6cabec5f177ef -size 550876 +oid sha256:7666b16f01866d904a467c842428ba8a37b4232cfc93420e81e983ccd558b896 +size 564048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ae3bd82317..a46fa686ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dde5018fc8d970b90419aabca4ed13d505ac6da64b1c26018781377b1f054d4f -size 739767 
+oid sha256:aedda1778b5bcb2b2c4d9f87c818a681db5f2d78a004e767f4f6a5b211c13666 +size 734587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e050c3d60b..5f022a50e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:273b4dc0cfd92c38229266156963264338f448f057f17d2b58d1a0666190bfa7 -size 613910 +oid sha256:f71fc4bbd01e7b2e90cdfad00b9f2bd1f77a68b01e179bf9371cae1cdb0544a2 +size 627329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3c47841c3b..9d9d91c5a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0305047e7335d8e946b8ab4f1e4142900ef140634dc40df57f620bc499ce1aef -size 757991 +oid sha256:5f7856bf450530ef08caf92a7780f877645d88f936aa041b6815868bcb4af622 +size 756857 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6cd6bdcbda..5d3011748d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66170e5305178c7a024f63472e2206e12fafb39a74bf789672cb360b9cf32250 -size 627249 +oid sha256:7dea8b537cb50df578771be16eb29f8ae8fcdf4224d7f100ae7ce9e22feb718d +size 645109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 76b93a5d55..66a7aed72a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67c03552838242933f337363b0a2525e27cecf1b848039cb5b9b4b89e41bbdb5 -size 750837 +oid sha256:a2740e7bb98aa2fcca35cbd91f966f77ebac76e84c5b67d44a966ffa005b5a89 +size 747089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f0a3e589e4..04483a76f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2bc5e8ed31533209a1821eeddd19d34d48c82b7d3f8c4c3dae279fdddb464cb -size 620097 +oid sha256:454e63ece3aeeeb3b3306b3fb42dc10eaa58ec193258e5ae41c58a6059859016 +size 635439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a9a0934d05..aadbaa5ed0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:feb4e0ffa152b47e875b56ecf3282490162a7e5c660e90c1a1519ee86844120f -size 761689 +oid sha256:da3bd16df07bc1048ffa89bc7a85ea8005887bc31633f328c5017da28a6725f3 +size 759963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c270b745fa..8910fff200 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13fb2cec2271191918f38f7baf5c5e7c0c54928a4fc8370c794983e4f7a2271e -size 630899 +oid sha256:e449ef07b4b4ca01f808025d53f960c7d7f8859875188e3df7eb22ab817217ac +size 650483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d7a6bfaf2f..a349ed336c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9869109d5cf31171fd0bfb574cce9811bfd4d19d1d1b25b420b5ce0038aa41a3 -size 754635 +oid sha256:3013db7d029171c20736022cd0295df96f78f124ee1b22cd6c8a5d81e19daaee +size 750095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e21f38ba75..6a5e21002d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:149cdfbf4966d97b69fb5e90c388c12a7af9224dd49b2a9e029f2b30db763faf -size 623745 +oid sha256:3ed50ede3a991f938826021d2007bc325bdee7028af359d3d4b965e0cce12770 +size 640815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eb5109de56..d965a4b03f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1061a813b5eba703b0ec0599980133b81f3b7e4c45a145eb037b61980b24539a -size 828323 +oid sha256:0711727c57e8120aec6869a0ba3f8e39dc9c449529e4ec6c4983e5638fe75c8e +size 827139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5e2d1fb6b9..c1dbc7ef8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:004a8cec99b1b5e79d646d0e5c73b81037a79c7b6ad59620e7d171778527327d -size 697237 +oid sha256:3e1ae74d6986be8b936a0888db905240925903ea06187b7773fbb3376663fcb5 +size 718893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0e7cb8f6b4..d1e23df742 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:361aea8885f2ebef096bc6d934241112f112c9f7566077140d1a476b6d07dab4 -size 821169 +oid sha256:a02a01c81b088c74ad2813cc154a7287d6ddc57c07b901ac0f834bde38805007 +size 817421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 831d7a8a4a..ba918e1a53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:088de2aaa906c7053c14346631ae50ba12d90916e04ed795a93b05c920b0dbaa -size 689935 +oid sha256:7ac10034b99ed2011c3d411e636107e314ecdb510bde0d152f53f0be1a64c42e +size 709273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0302c71c94..d70d151cae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb7723e2207198f7a92fc2e11cd4481621b8096969f2d4455edd96e4c9a98e96 -size 653603 +oid sha256:7d66f7f77cc1329c12e94180b316aa70c835097432720873eeccd74eb29afbc2 +size 682563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4494ad8961..9082ac2a5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cc7b8e943766399aea5204744f2001f17a9215f53940bff284e0a5108b5818f -size 576882 +oid sha256:cd47e9e3b611420d51e340fe95494640508afe0975fbd2dbbbfe57a012b6bdba +size 595924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0f41bb2390..3ae1bafbf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6158837cb83ef2ef478a4387b440bfad6a062f00f78f204784a12333aa368fd4 -size 658485 +oid sha256:9aa91c89a7255936eafb05eb31ac14b2e7332500ce77d992dd5866595ed4f2c8 +size 688233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index d1f65d2c27..6deb03a2c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75f787df80a1b9ea4caa5f35ff356784d73424ae66d510c550983595154869aa -size 582700 +oid 
sha256:420b584b09edfee2531dff86a0a0e1b6a98a9c31501d508d37061f5f88b03146 +size 600904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 10ef8a47a8..3c1f12e5a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cf727e3e8b14cd9fc9c7e9b8732686ea17fd22c8a164f0dbfb82ef16253f33b -size 720631 +oid sha256:5d6e821a6dbe8ecb18dd9cc9f96e5d150e0f566449841a058d11cb822edf2002 +size 752895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 365b8ec5d3..01cfc0137e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4780ab060e5c829c6479650cfb473e2c87f5862c59a046f0e87a631cc831e496 -size 645931 +oid 
sha256:d054010c6e53571ccb7509cad288b2cad34b51de6498b77bc7e5bd33f0be7241 +size 664185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index de363cc75f..0332dbcd59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d010522228c55a831b4b67b37405c1174d6e0f4253a19a404f79aba04b919356 -size 739791 +oid sha256:a5b24d49d30a9b956f4fb6c36d69f6bd10290993e1d94986fd32c2bd65d54629 +size 778665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f573bc16ef..52005339b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6586394676f8aed00264c89e1ced1935456476ef49361e67c46872fdc846fc5 -size 658037 +oid sha256:2e25806d7293fab08d2f994bd0617d0001f46ed409c4d955c70a9e36f4b6f7cc +size 681175 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 69956798e5..7657f77fe3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea73df57ce79a72a9f42d6d96f92330f829b074daadba385df506262d8242e20 -size 732687 +oid sha256:8ae203b027a0c9ed6fe5d340cc7e433be9ecb6e580af28672e8f1ec4bfbf6bf7 +size 768897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6a49a5edc6..d9ff60e224 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78544e2672ae08001a638118a386bcc58bb7689846a0b73612a05f36066da033 -size 650885 +oid sha256:2a7ea0c8d7d13db4739b755b61baa67ac3930a5e8f2664864cf2c7725f87ba70 +size 672345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 614531f126..1ef88655d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d3d6073f52fb902b176c5fdb27722b19cf0ec0c6a848a9a053eef1af2fee980 -size 742897 +oid sha256:1707708e528f4839fad3fe88d495c5c318d0db7db2f7b52eecb6c90912765da0 +size 780045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 209feb1cbb..8848a3ab4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ead8fe2cb324159c754f49ebf71bc2a52105b22efd7bf58b8a1ec7cce5c4406d -size 662525 +oid sha256:f34408e563df231b3115b9c3be986cc0fe65eef98b747030d85c24d880f64366 +size 686451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
dcaede1117..d8dcf14259 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a09252b6632230418356b8185d647afc6774677f9b3ce93b13762b84efab6615 -size 735743 +oid sha256:73850acc8aa4c55081ff7c6f421c3a0369b415fdbdc5c83b40babed266acc553 +size 771067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 24910c1401..2d55a18a50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47a20c4eeb13d2c607f5066ffc1f9751d9c1b0dc7d2ba2174c27294579f2d9e5 -size 655371 +oid sha256:fb38ea59d4b8c9f07fa237f026722e3146c5e5520338167c3a659db6f0953f19 +size 676831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dede6e337d..ac64abed97 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06e90bb9d30d0803ced94db6bdbc6a457d1d77b0e89a2aec985a05c754352988 -size 810073 +oid sha256:ca8529b0876a6afbfb859372c40939670e7070e2b8d5850cda4a41927289c19d +size 848307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 19fd0d3512..9c818afa20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2d15559988d23339b1759639bc1d23ccbde8129686432b2acf557342cd8b5a3 -size 731429 +oid sha256:f9fbaefa52163e84ac16e1ad4689088348ba1d8cc940fa5e7940dcba89d39c3e +size 754911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 189c08848e..513dc6fb34 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d4081ca8bb8dad8789102e2592c2b82885808fd6b1dc607470334c722a94b5 -size 802921 +oid sha256:e92a1334bf4350911703ef4a13e86a58a4b8729a0feb390b224dd43c97dd1ab6 +size 838589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a63981bc31..b21ca9b5cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54f948add67f1a38b8a6e6631592341500c0e1be9a64503f5b4348b0a81bcc23 -size 724275 +oid sha256:ebe3fef20e445a5558777a21024444bb58bd5d685c431570b2d33dd93124b6a8 +size 745291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f965392f5d..dd530ddd09 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e3d4425342f12a662650b31acc4d975ee9ca15f3ab52d8b92e4d69314ee8d59 -size 648177 +oid sha256:df40e24d3814cb81ad8fa84049f00a278af632e778a8446ca34b698702729b9f +size 656071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 526f6b919d..4f6025a6f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d8b0f513d490345075303ed88104f223f85b69738974acf306ac0ffd73b0cd -size 557542 +oid sha256:342305d01b6091e237d04b1fd4906825e9b5b0ac060a1199adb78725c4375aa7 +size 567754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 371a59feb1..c6650de383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1385cfdd2b19d81d76448971809732384f40fb424ea552ae2229ece495f54ed2 -size 647829 +oid sha256:c9a9abd646f8403a2da380dd8a415b5cd0844e41d5af7f0fe6e9d345c04ad78d +size 659571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6dcab84457..6c28f75aa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1167ead156f03ea623743eb2c403f90f579743d2a6d713f6db7dad201a96984 -size 560748 +oid sha256:717388feb7d94faf67a5f9f919b4e6c9b4e7394a4eb928f512c5c359801719e5 +size 571058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 706463d679..f501df6d34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2925f07ebd3d0fb2fe91b2c9087b0be4dc9d3e98152581d480701062857aedf -size 712145 +oid sha256:c7198b526ceb7f373a54bb021fd5fc4199998fde9d47031bb2200dc729c09816 +size 729511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7e60c26354..400e1eeade 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0e4a042b412cbe9413431896ca8c6158dc4d4a966217aa9061676c6d4458db8 -size 624275 +oid sha256:28eea0f96173cd63bd6ee65b7b8363002837c5e17e2b6674e91da7fb1a035abb +size 634387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c92fd69556..bd2fa93910 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7d632ca94648f9469c80d4bb0aafe4ce9f40abfda3c411facb0368e782b326b -size 736337 +oid sha256:ebe87a002f81e908431dfb6da8339d4ca68a2d639105c67c16ed9f34aef5834c +size 760117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d070a591a2..4090240db2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eedc93ee8bf0c07260448d52718acdf777cd2f42f60bc1145978e3b381c6f51 -size 646839 +oid sha256:0b0f8e8698272355625ec3eba4bd3d261ba8663d9ba5b1fea601e0f2b10288e4 +size 667757 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 84715804ab..86b450c79a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f8ac3ab916c88c143f57c952ccf636c767436a39173a40f6ccb9528370cd177 -size 722869 +oid sha256:fa4604c3ed12c57ee12294673830392595b313d7254eb20e65e14e4e04167c0a +size 739889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d45541647f..03dd817faa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b9d99e58401ff3eadb72e3a183925947d4dd5727558892950649ababc667b69 -size 633321 +oid sha256:ca3f516d0db1db8fa141ae38309e4e400494d8dbcc56eef2ad1082b51b667f67 +size 648417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8f25b2c4e1..74b69caa78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c4977efc8cbd9581787c63c1f627133bb282aef6071e632bf0537fd5ee1f12b -size 741615 +oid 
sha256:ab9dc351821c3a373aa4cffe779137b92d78c1195e53621bf5590689bc1c3663 +size 762631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 35bc4ae35b..41957ed8a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e794f61517b036d919abe3cdb685281588f682d7413a3352ebc04c75e80d3513 -size 650635 +oid sha256:c1b4ad2a91e1de9025f88b9f9919fa552553930b5637efd3e17825aeb25883d2 +size 671651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cb62d5c583..f169ed4aa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:132ce9faa126be08479e4cfe3df44629ad46231bc3d43e471515460d3a0592a2 -size 727307 +oid sha256:22568d7b80f73608bc38f86155e57bed87e87a3a46793899b8b8e6e3c84b85bf +size 743143 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 57096b6f98..0bf7034c44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dc483efc28f554992e2fb78f3fecf9f69dad30387af5d3ce4fe29d00394c221 -size 637167 +oid sha256:c456197738585cc014a04f13f11d336a96081926ee3c3be7aa215af778b96bf6 +size 653103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 05ebe4429e..6942ea0b81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:366a35c0ef7b2c4e2b3e1f2ee45ee905e035c6381c9eec35e9f9d95bcfcd5465 -size 808545 +oid sha256:20c220f4cb8846aa1bd132dc41b875e7914a473c95564606c13ed144e1fad433 +size 834593 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 71a1273a91..79d7d15896 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:960ba1a4a31057192522851f6a47a876e149b79e58a68405061f68b50960a35f -size 715049 +oid sha256:a2f5481dfcd9f221369c8cd15f21621018c36986e654d718c4bfc948ff6b4ab1 +size 735079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index aeeeea7f59..dd594f0cc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c03ab373749abde0b47fbf08a61a91b9f8f401b2976ec2d4def2a47e221465cd -size 795027 +oid sha256:677403b97822c6228df9328792c95d4a31f1b83e7f24ec4e7437240ae46a4df2 +size 815155 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2e772334eb..c4b41eb950 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e934763e9b54f009cef81323ccd78e4fd924e335c1728298ffdb83dab30195e0 -size 701533 +oid sha256:d8643c5effebb17fbab2f30c67803b8b04c9700b75cfb74e4983328e84d7b628 +size 716529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 40bd42a3b0..efb7cb2707 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d96d8d5088eda1e97a6b4b904b1567eeeaecd9516406e108ef9fcae1eb471640 -size 616994 +oid sha256:09f54b8ae77a102f96fb8eb27e86ca2c199fb8c5b88c88ffc98d8c12d3790c41 +size 639935 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a4986e76fc..de6d9a3bcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf27b6670cc4365950e16d00a98beec8bc551a8008d9a8cfb44ac34a21e351c3 -size 535586 +oid sha256:470a3b8bda68a921465960cbdc394cac6ed46f75d4018fcbcd00b1acf0951bbf +size 554036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 74193eb3c8..ed984a0118 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0dc592d16e6d9da74dddeb0e2b152fa2bba73ce7896c419a8c9ae669fd9a2bf -size 622123 +oid sha256:e847a72b4cec7203ccb485f4c76794f77d408b75ee79e068b03307ff7825e06a +size 643139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b5c4a7f7d6..3802a760c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9df3560b88dfa6d14b1ca2dbfe4150ed4b11fe6d12fd03f8e79a37e053c68ba1 -size 541504 +oid sha256:87d8a2081c822625ee94e3a26a40e0a71ade527754339ccd3b69ccb677b3f013 +size 559016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5628ee44e1..5a76e6d91f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed66ed0be1cab2f108aa680b90196c3c49b6a608f3a5633c8d7c62cd89d29f6c -size 684811 +oid sha256:8cccb4fb89a15dffe3156ede2ae36bc915769ae1af4f757afe385ee1aa574294 +size 706221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 12a03edb9e..aaa8c6d1c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1272c1eddf35fef1cc981e4636bba2b49b2b95dacc52a6c3ebcdd9b831bab48 -size 604634 +oid sha256:d65a04c104fd281e126cd9c037758297f21d13f3dc03410910f0c5473ca7987d +size 622297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c91d2b8cee..ae9e94a5b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b2838b1924ae0bb306aebf8d9914e158948e4d0a5304d493c5af54e04669355 -size 703083 +oid sha256:1eeeca97dfbc5707a60fc16929dd496a1d4f758726f729dcf7ffc158f6bc2021 +size 729525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1d3895edf8..dd63ab8aff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93f38c66a04ad7e2c322b6a1f52a862f989aa1c6d09bd4de2fa046b40f73d954 -size 617925 +oid sha256:8740c3d4a00a632832a48802c2fba8ff9d2c9f4fc2505369b19d989dba99942f +size 639287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d40a53e81..f52831a36b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd6f51a31458884d439c62d2a2c49535bf5c6b70ed6e7d6be54d6c8b01d966c6 -size 695929 +oid sha256:a0da6d49f0fdc2407be8f1f5061b1d61186bcc767b1f4980456ff8eb005cc565 +size 720597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
c99c5e3495..20f5bcced5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ad96edad741dfcc0d8007d9917e57a4b2891a3567b83716dd086d8140ef6b29 -size 610820 +oid sha256:07a596bc7fc978f3443e35a051ed7361e5ac68ecc1defa5d9c701ee599471ac6 +size 630457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 31285a833b..1bc1727dc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5caa71ea4e57461b8fbdd545f75ca09501efb7029ebb1aec22fb2e665b9fb7e4 -size 706583 +oid sha256:1858843aefaafe3d287b76a0bde1f16f51939863684a509971caa8300353b790 +size 732483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 343e142e01..610b30c41b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecfc5b4e398aff01854541300cb9944168d919a423573a38a6901841dc8360a7 -size 621623 +oid sha256:0e55e258b177547e5f8551d8fa7028a7fa286e92537c15dd18aff3bf5ea810a2 +size 645451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 23a2431240..37ff71781b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ec494e84c4936ac572f3cac3773082eeca984995023d38ef925fa4c0b806246 -size 699431 +oid sha256:c58022691b58dc27a19120a1b707675ee972d1383166aec029023b636373baf8 +size 722765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2b463ef21e..a42a7d5f92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:9af498147dbb687a59f9380d8d4d0d9b9df972a9a3aa6bb504fb476ddea21382 -size 614470 +oid sha256:c2f4cc05a87797116fcb6483a67e88002031c9ee53f23f8f867fd3f4ecc1e425 +size 635043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5c73ef5e07..b44de552d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddb1b81ec33f46d2dfdefc9163ffbfeb6bcefbc4b75c43e11d8f9acdca365f85 -size 773415 +oid sha256:3fa05388a684ba433b8197086dc4ef3ff895ac5ffe1e3f7de060ed6097c27b7a +size 803065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c7d40acf39..c0113b7db6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ccc8a00a80d11abe92d1f0fc9bd625a5f4d837b884dc8d1f30498844bada6543 -size 687813 +oid sha256:e6351a25ef5699eae57d19400ead832ad90d6c6c9e01fd5badbcf5492657d713 +size 713023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ccdfaab489..a2e376611b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0bc1585f3c4c2c69a05d196de2df775336161e08a35e770200ea2ab723229af -size 766311 +oid sha256:7e824fce818105e7268c6e944faa160cb73762a0c793d921713d643a7d0a1174 +size 793297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 72f55e067e..a0c1abb56e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:55e5c6b318e5201f8a62d5e7e0b01d5251ad9298b26cefd8c0c9530155aaff78 -size 680661 +oid sha256:448c2788b4726aaba61b8422a794a89b1cb95583e14a1f2653c529882be01cc9 +size 703403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 513ef7fe8e..026c73b251 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88e43c5f879c6ababd16a308bcdb4618affa07c09ddef7c87f7bf713c9055d86 -size 631995 +oid sha256:0c2cf5f904dfbc8f09408c26db36ecbee05fe7c11a18433b44a8d8ec4b314b2e +size 656711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8d50983789..f69b3181bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fc4b12e3eb09fd7a599eb676e0e1297569076096511a94752ac0e31a42c447d -size 554582 +oid sha256:c0b36c3d46257140b21d8408e8aef1cd0ed9fdcd7ef969e76b2d98eb5d94dd99 +size 
573378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 1e9058c076..f4be62658a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e42bb5aab59a8409f3311da30a1a071494eb2e075303883f9213b53c38bb9f0 -size 636877 +oid sha256:810f4a9661d275f5943a7027dc32df7c802bceedca7ec64f40941c2b6efcc054 +size 662087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index aa86e9503e..e248e2f571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3daddea950dbf3c35f32b3298ceb82a77428ca4a036f5e1c67e7bf384715972 -size 559612 +oid sha256:f33f254673a5528c0aeeb20effbb3686601888ad4e036bc824d89d85c834b826 +size 577570 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 82d507e16b..193ec7eaf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:664b214c9220778b979ef4f3773ff5670a67a8f5ff86ac141a3930eb5e6a22cf -size 699023 +oid sha256:f87f56c8449624d6b2ad02e8c231792b57afbc1a879b504a022d248db64f6ca7 +size 726797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3b59698488..c5e1c9bf38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e3325a6c9ada3b191b966286653185ae68ea159d3435c32afdfcab6e714dad9 -size 624867 +oid sha256:85546cad81a38087ee7b88f1a7fff6ce25258d714c9d4118f0929438db4c6b53 +size 641639 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3913a9ec80..61e496d79a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f359a9ef05d3fd219b4972efb6e138ad8fbb3294fc02e031f78be0f45717d2aa -size 718233 +oid sha256:bf5c4403f86387af93f768995e9da8b6b842cf1086abc5a9a37c9147d4a5dd2c +size 752569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8a6bf39771..3174a08450 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5fd14fb2053d204ead0f14c84545c5a3d6997886f3cbd82ae56a806d3c0bba -size 637811 +oid sha256:e93f32ce255e0468e25298e2cc973ceb935ad6e96e545a7459caf0a664e04dcb +size 658629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3154749fca..5f2babc8ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb0897cf01534eb16ed1d4e185b889592cf6f366604f71b1d403e007564c7400 -size 711079 +oid sha256:2e2bef15048374e9f7aa50c29a9dfaa3494c95a315c6a05d58e2796d46fd6994 +size 742849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 99d139cb8b..1600ea777a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:695993e588094a0e9ca5568c77e2b2bc2b45e394199700addc9758bb444c7e83 -size 630657 +oid sha256:76b203cc83fed13598bc67487857838cc9ca2a8c4fdfd9b8b16ded09fdebf0af +size 649799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 
3ae5c7705c..ec2f2bbe01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:372ba6e8224ca35081f0ead28a07a3b64f7f9f85c6d381d6dffe58c1ff33dd92 -size 721289 +oid sha256:32d2ee209300a85faf1b33f5f949531b5957469d50f540afcb33473de62ce9f4 +size 753947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f42c1cdfcb..38e2c36796 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:318103a47da05caf23f6ba975520eb24de7c6986314e2d05fa2b667df2dcd1b3 -size 642249 +oid sha256:337e2dc44db6aff4022beb7f1d3d0f66a96f60251650d9a8f9a683fbe304972b +size 664795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 941536c128..915974028a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ede4592a267c4493db0c03bfdc98de56025acf49d173de8a00e57fe43b8d8b6 -size 714135 +oid sha256:921505e65c43697569596e9b04eec681545c5c4b92bfb897c6e07f92c6f4fcd2 +size 744229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 12934f2409..557adcfdee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b75a8d9d2778b046bd4a0b987e8b3968f9d2427a31c2162994463d81f851c1b8 -size 635095 +oid sha256:c08855e161e5e600fdc91381e9ec63d117304213513efa1590e231d70155e3ef +size 653497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6a071f8d75..380fa39d3c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5b5903e5235ae98590ed03918f88663b777f1b321433fa31eb3a4531a288a23 -size 787677 +oid sha256:6bde89e0e095fc31092377701ce2ae77d4ff5d5a6968174dbd80320e064ed58c +size 822209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 790f768911..1732c7a9ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5363b581e4eeb8ab783fc74890b50c5179081aff9046730f9e112a1845641ca1 -size 708587 +oid sha256:2057c942bea26d0721033b47572e855335f435d255c8e1bb40991eae45034449 +size 732415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6109b97881..08b1416d44 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83d8076383c35bee6ccfe3ccdf2f443cb284f8b8b47c99fad6af7e6f4a832879 -size 780523 +oid sha256:04e7988a389a6345defbf42a89c417d05247d71de1cc28c7da6706647fa18e9c +size 812491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 939889d991..1be6a9bf7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:706d506b6950a6ba26c5602bb77af553af3558536583d83d9f6d2921f2ae660d -size 701385 +oid sha256:aca3f8117fee6d3acca8e33117bdf6ddc495fb3df5578ee7951b0960841695cd +size 722745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f8520270a8..5819b4591e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26f8ef06caad000af4a4d0b3942bd65f2dba10d7318680aeab4021be5fc1c65a -size 625731 +oid sha256:b1e76f4af339b1caa79643c25b3f43ab6b5d779012495624f0804d28a7625588 +size 633475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 94ef2517a5..bbe7c11393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9710825b864c78195f01e4991374f63d730095c7598d4c92a30fc2012947f7bb -size 538302 +oid sha256:0bb23654c9ab3b827c9c6ba5ab9f61bd98c5f13da9b7c6a0e408cd619e78b6b8 +size 548514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 02aee15e59..8b64bbdfd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02c151ac588e4e02833859d1576886a4de4969526ab6a381b6001abdd3aa15cc -size 625433 +oid sha256:75328d16f046f780d44ac1301cef4ba03a77b0eba280b8dd98394e62c11ea21a +size 637075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 35a7a8d4e8..6c30e5e4d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef333df6b0d8fb6687dccb3e825c46386a15e5540eb03cb1b158ee88f3c99b85 -size 541508 +oid sha256:47936b621ac25c3f55cfaed18b4c2e5ee7ca95bf5be1285b294069e3f6b18dd1 +size 551720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2302575af6..785837c0f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aaee75a9444b93982c2c0f838337ba64239ab8e3be6ec0023c7cb93f5ce4bc12 -size 689699 +oid sha256:997c61d48e63cf118ad5e7301e11dc2ca7611b38d5d5fc9416a1b7fa19f920e1 +size 707015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c6ac1a3fe2..eb38df1a62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b4a26edf441a7e0b82f923a88adf7707367d864746d707b209948cff95c2b92 -size 604984 +oid sha256:c0d63f020b7e9bc6d82c80479183ae68e516e4aebe6ff759074fe0bd6ea77f3e +size 614406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b8f751b14..5a108671b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c476585260eca4e7299fee6b9742be7cd80ad7e3a11003b67492f27d5081a9fe -size 713941 +oid sha256:6d2d1862e557ec4037df9664eb1365e87bc626dfc8e6054312cc90e2a60c2751 +size 732095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5b057ccdcb..80ce6102d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dec5cc0d3bb67857917283dcd1b6a58d7ea0e18111e67786af59e41b5026732 -size 627649 +oid sha256:b466d835380260009bc081423239b587154c7cc81d91cc6d6c4956c5fc9f1190 +size 647777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6b49f2f686..9b0a4db6aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01aa3082c3ef5d7bd9a3836103b964726fbcc824bd8044b9e56fd136fda57ad5 -size 700423 +oid sha256:6570bba626b1ff6061a5f39476fcdd973596b07795d411afac831f160047e294 +size 712657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e273d3ca27..77345251dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46d9e8035db159b4afca6a508bf6ba8099fd735e049eb49f98606b755248898f -size 614180 +oid sha256:41331d3c3aca30f345c20ccde039e3f42a21d02b579458c226a4cc2b505761de +size 629177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dd24379998..e987516a92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4d3dec2604d70279553f63f95c748e7b78d707158eeebe8dd1665763e8449df -size 718427 +oid 
sha256:a6f42bba05b90d9cc09f49a98ba5d2ce72d65e343cdf00ec12e905697d6c152c +size 739987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 85292c57bf..d775954a3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51f91b0068d26d3528321257145ec71dab650a39933bcadf8cfd61fb602d0eba -size 630705 +oid sha256:aca85a24a81019a625ad790290a5b5bdaac2d2ca4d099fe1c2678af1fc50288a +size 652363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index db88e27dc7..5f89f2f1d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54c792676237c4ba04729f6679e324f98b0ec2e8108dc52a917cc4f834e74ac1 -size 704911 +oid sha256:9c342c234ee2ecb5958b867972ec9bab3b97ebe9949be9aede159235650d62e7 +size 720549 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 30897a7696..15f3accf5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a62dc2f8a3fb7b12737fe53389238e5ba0177cd59a922443053df8835a7a446 -size 617977 +oid sha256:287ed54b9a2af69e18833d4a76389ca0c48aa6a99e167d5a150a9f821d1eaf09 +size 632975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2943b70335..b1b638f7a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c72d25c2b00471b9cb72ad13ed9322544f2b0f6a859fba0d0533236cdb9bfa7 -size 786097 +oid sha256:0f54576c2241d8c10ef462f1650db88df98d844c41d7eaf54f325115b3e3c4c0 +size 811997 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3ba6be59ab..e864c79fae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5caa9369156f8425e14e73fb95d39847bc03e537e3fc6227f9b3efdec7e04443 -size 695859 +oid sha256:6c88d4daa522b512e30c42a3578f39ca081054e5bccd3c6bbb0f16d4bba30ca3 +size 715889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 937b42391a..18be532ef1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed8aee5f00a032d2a157e850e6f46c6b1c75bbe76fbfc0fee4d2c23e7ee100a3 -size 772629 +oid sha256:b8ef25022e902a3777db6242a85a6f1f0a510894893b78b866557158559d912e +size 792511 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index dec8dd4960..15f919e34c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e4642d329d245ef050b508c58c305a3b4e3815dd7f225a70b5c19528ca49567 -size 682391 +oid sha256:0486c59bfaabcc358599195cb0856de6fc93797c5e9433c24414f14e98ec9154 +size 697289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4b1d0e0c77..497baf721f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f68d9c7816c1eac4ed9169609d7242db63e320166035f5ee559d9254fb32828 -size 625923 +oid sha256:2678fcd2a795035a46d3b755e4507a5fbd22e0b26d50dad7eccdd0bbfc5b15c1 +size 645459 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ea6f71a39e..e0ed25ab94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d245b58789f9373adb4eb70e91ca54b3ae0d60f462dbac95ab1b1d3dcdfc9353 -size 543824 +oid sha256:e7878921e11844c8cd399aae415d8eed7b4d0e9b63d3a11335f09470e9373d54 +size 559610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 784f1454e6..899a3ac091 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38e20344033b2555abf660fab37a8a49b3c4b6fec8566778d97472ba705d4a95 -size 630313 +oid sha256:446cd09643e3999485e26998ac13d147f960bde086c867be4b666375dec20393 +size 648713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 1553f3092a..76a5914880 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5e941bddda768efd181a294b360e79f8a6bb4f5cd9933be5963e803c1583c3a -size 549742 +oid sha256:b29c2342cf14fbfc8be4b8ca9dc868169a4143f2aab4356bc5940c22b5ae90dc +size 563802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 67378ec631..b93cc5561d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:672209df04c0aed5846a6d6eb6455c04707b78612c5672f61d3597591dae7c35 -size 693739 +oid sha256:734fe68ae3d658c139b19912738d5a8f6445c748d9239f85703167ccce274751 +size 711795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6112eaf615..97d51076d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19bfca8f73e41d395d3e6844c6dea8adc8a744a4b27b30dcdb398758b7a542b4 -size 612774 +oid sha256:04136587327bcc1f8f0bc567fc816d523f9b52b914fa5c6c0c0d85f0d5efc16e +size 627083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4a7778c54d..9649db7905 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64b79d82cdc59862eae7a35357bc4205a0606b62f4b637b10cf83a8bb3ee52f6 -size 711223 +oid sha256:891c98f77931a3049b76450d8734bcfeefeb88322cc9c5d1dd08e2aa5e043f85 +size 735101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index de430516e0..f1264a3244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8b336587a1cb0937b20ce5822ef3d0b5d84dc7f97ed154e3675e2a949ba1143 -size 626115 +oid sha256:43d34a90d6d590e045aa28bbb383efb1dbdf21f8cda040a72faa3d500035076f +size 644861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3a015a77a2..b602fec9bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fbda1584e1e47c18003cf7e2e54df34602dd77510f5e5bc5f437cd15babcb2e -size 704119 +oid sha256:4e3eb3da576fefcf427dc01619c88ab4420d74e743e059512a3c3547f69a61da +size 725381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
d487e133d7..92a838c72b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12a0ea463cf9cc67132d0ae3b137f6abaaa293a19a4917a4afef6d52afbe519d -size 618961 +oid sha256:831386368119d2206a1f7eb165cdeb2126979fc682bc0f511165f7c28331db87 +size 635241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b43d765856..917eb231ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39c2e857d3ebfe23b534062efc2fa14c2653e4562a35b07d44e927a2d323a015 -size 714723 +oid sha256:93ac6a8be4751f2e943582b20fa9565042f005087901364d4682a5a26ee8dce7 +size 738059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index dcb98c63bd..53ada6cd42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:531eeee28f5cff0dbcf180924ce68578586d438bed65beacffdc46bb6280d0da -size 629763 +oid sha256:a2b14d15c1e482c81f59fc55565ab2c011a75c1119d2652634ba027651724f2a +size 649299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 45f1e1b27c..3f6a7d310b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f69859630a6e631082085bf0adf2a2b0165e973be9596c2af5b546212d87d99c -size 707571 +oid sha256:6f6bd9d1464e821dc335cba2176f12a846faae3d627dc24d58a429e8e145f9fd +size 728339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e548fe5f89..1741aa7440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3c4184f7abab87d094d132e471594c6cc1363af0d8e27b1b2ce39c6d820aebd4 -size 622659 +oid sha256:6ef0052e6e9114c66ed2702294a6d808f3698f2294a81f4763f4395f01a54af7 +size 638743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b2dea586ce..db0f9381e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c17e311ef4876bf4f5665f7e1596b25e4b8ae0b05cc4b570eff25db8527ec2c1 -size 781605 +oid sha256:e5456f0aadcd1532a3454adc04fe1219725d4bab1a27e1cc111c8b6d7eccdc82 +size 807849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a4721e1260..e8e12bcb3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b373db2f1cfac7d96cbeb3bdf91993ca6ef1a2411cb4e4f2d55ee21f280562b1 -size 695263 +oid sha256:69788206a57adf87c10f060cded38202a067d8236784f46b6c663e751b0f7872 +size 716871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index aaa040fd40..3a6dc6f852 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a622e937e540ea8aef0d426b9d0f914cf43a753ed14b4fc1598b256304d307 -size 774451 +oid sha256:ecea6bb5fc5dfb27549b0f4ed095ae494e17bfc9cc90870160fab44edb98a486 +size 798871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6341ff85c7..c641271aa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:3483f215af77ad70577a3700f4c29cc33599e20c231b45234dee0f3cb4fccf08 -size 688109 +oid sha256:13714364b70be2c42e8b9583e1e1fb303783633269ec8fc9782eb9de5652fae2 +size 708041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 30b2eb94b5..3fb57c92ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4277e4db36c3daea001c03e05bb076df65e47799df0160432c7ceeb048c597b -size 697855 +oid sha256:af88438cb9ebd76cf0185664e81c2702ce9b04f89609531b624b56abdb23de5a +size 716455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1f31e6a9f6..5864faeadf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9c9f1310bb5cf37c5b01e95e680b47947e93e49491cf811087a317b40b78f4a -size 614374 +oid sha256:50771098d07856466002b800ff7562e0e9832802a8d9c32cb66ecdb00164f3a6 +size 
631543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f7e41e58f9..42bd205cae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5f139643d87c1c34d765ab1b9681546979d03f5009ab3fad7943385187e5a2e -size 703329 +oid sha256:638e834b4efd6d0443b447c153a93e49554c8cd2ee7b5cf13b1afda5a0d6edf5 +size 719807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9f197172c0..95ca7f7c92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee48b3f1d0d87548dfe90474c975e0a4fda9902eafe3338662d46f22fb3d49b8 -size 621725 +oid sha256:e878a841123ac4e2eeb7861748deffd82a36497fed1da91a78bed475148895d4 +size 635735 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bf6a16cc1f..02dabde1df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9266c133d746421ad4ee5cbe577fe2d4d144b1aea8b47cba4b6fbbac00e75328 -size 765671 +oid sha256:b9cdfc7286ee8256dd3d35a974d30fa81ca17ce558ba8e854a045f859335488a +size 782889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 55d09b8ebb..376fd90472 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce50b7a4bf5f867da6dd3974b48aeaaa0c95dfce55f808d4edad45501212fe0a -size 684559 +oid sha256:c195d7096b57353d5a261744b38df5a54231ac9093b87d07b34a73a582b9bfb1 +size 699803 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 95b6d39161..54fba0a329 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17c728ce18d4c3e30cf2460dc2b22c5a23076eefa31aad5622bf3857809f9694 -size 796179 +oid sha256:37837008649d8e130ec943407b3053e4f2a9cbd3d3404b2190a6070c826d6245 +size 826963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 40cc66543a..74f2b4dcc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3045bf0293660f957b9423fbac332014522fa6be5f384f01ddaa38a31004ee9f -size 705891 +oid sha256:a59a42f52cb90b60bb70b1c7fa6a1233435f3101e01f63016ac16901ba87b807 +size 735689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 76c0d3b9f3..f9bf22a3b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f3f76e8a3f17801ba526732638c4dd02534762c6fe1448cb78df45fb3ff52cd -size 781873 +oid sha256:6aa99922c4deb8392477471cb42a22f33f1d3392dd302e6f8bdb2f2aa68c6275 +size 802247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ed292d8002..fbb1739f3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c9bb03430c6ed6fdba47db5e55120ef858f27b9ab715644dab6c8aa43b150ec -size 692373 +oid sha256:94d3963e738b883a8bb7920405f9311ae46da9c749ff9657b7d0bb6291cb3daf +size 712057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 
1c81c9b5c2..30c5fe80a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f996dcffa660c7b6b64540f23e8c0b762dcb20938a39a259e2eb5f945c3755c -size 799531 +oid sha256:36d98b9ef9a40866a067270fe609bcf2d50715d7e8e5b83ce74f1f5f2e587b29 +size 829131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 867ec7df39..3e6b76581f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3f98a2b9f1cd284b7bb634e5921044d18f23939184fc5e0326e068e64c149f0 -size 711759 +oid sha256:eb40688be427e5970ff386f5688b011e025041187edc468c08aac9eab790b7a0 +size 740767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5c69161ed5..043e8726c6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12b240fb28746814aa8d3c6fccf8e89d4b79cf2d7e29448b5c1c3e95c13ef60c -size 785275 +oid sha256:8ff3b0aa09585ed177385b95a321549d2ae7aa0b8b7c1747d52ee1cee8f5cfb7 +size 803725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 30b869587a..a977eddf3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98db9df928505ce80e48717e8e2846a79711c66ff1419bec23cfd9c2b755172d -size 697453 +oid sha256:a999e2a7347b7e33c80114729d2347e3530050f31cf31fedc4393d258f19d7de +size 715559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d82e922a4a..402523a17f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e17a1ae8ba814410d8227d3a55c7301c50cadfcc18068058b1ebd0b37638c6ad -size 867201 +oid sha256:b011cb644da7ecccc7fcd8acaff7efef0d57e97a3e17ad03d17abd804f5b38f1 +size 896407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1ff4859a7b..0ae4e78167 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3609bff1aafae19a5f6ba4fcb593063fad84029f948c2f5d4739e982c2769e01 -size 780269 +oid sha256:ae480ecadae3d4002b48066b8ac783fb08b57d3aa764f1d149190b77eea4ae9d +size 805281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7f47c3e9d5..981d875423 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38ce4d7ab63149d597e7a909a2a0178f7821ec023c363578c464b0a3c7b91d95 -size 852895 +oid sha256:d82ffe05bebea8a2fd3e19f4723c918572bad42f033f25bc7df7db2fa0724fda +size 871789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 533fa0e8ec..ae82e463f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67b298ed972efd13f5f7fad8c7d162d5f5c4695b45922320d79966af488d2dad -size 765961 +oid sha256:acb61f85779375b9eab457705961b93174d93e3a248177a60f6f54a0135f68df +size 781649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2b82cc9175..bb0d26e04f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:593793f2f5579673c7e315393935a15e9aa2b6076eb58114047baaa054c782e3 -size 655971 +oid sha256:0978a67bb4cd9f7d37650da90b8fec23316bd3e94a503e985035e375a993ce13 +size 668699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b95789eed9..a5c5ac5c94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4e8643e6d6a48773c9636aace190e89ed707e894c4067972d99dd4942c5856f -size 558036 +oid sha256:85bddb9a92ed308c6694b57e6e360235faf694d106c7580b691f82a28b1014d9 +size 565338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 14c0e7fe70..a8590004cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e100826e98fe7cc807350b1ff2c946d298e00d21da38e34c8e70b7ac9ddcb252 -size 653453 +oid sha256:bf2b75a0af788da776b1339edc7d6c48c6c4617db1fba9ed53e02a077d25b516 +size 666873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 784ca7390b..6251908a28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a1c172cdc954551cb210f4d04ba320f5b0aabfe1cd93f2b2817c3e1ada7bc95 -size 572686 +oid sha256:7f418532bdcb77ecf3d0868a95292d17c6a5c077ce0802555f9b204f4f8b38b4 +size 584772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3a584c52e2..2d1628df6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c75467aae450a26c301b2784fcf5183c75056d311d72cb577a6535a0defa2fb -size 723935 +oid sha256:fd165eabb8f8baf6e91b5bb029b72107845d320a9c2aefe5d62a18e0f6115ae9 +size 734443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e34e152594..6286bb95a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a219c56d4a7da48c3c4a5fae883e02de9f4a1ac70f08ee16b94e7c49d9b68399 -size 626347 +oid sha256:5dae0d644e062796f5770bb5796f72c942ae7f8c9723613a0754530e3b43edad +size 631971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6548643814..9116e2248c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1156f2833c0774f7bc9e820abfbc7a0f2365cc0d1d2e1d0d592b38b47c5cd21 -size 774225 +oid sha256:5675e5cfcecfaf6fe6291f9b238c430db9ee4642d9b69b67c97ad831e2294943 +size 812607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ccc161c720..293c6c15fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b0fff8cb51d620e076a92feadf7bde97ef6c15eee6c323810957f899b66c2b6 -size 667313 +oid sha256:9290ededa6b475f53e5afd9935f19a65dbdb400e7369116cbeb0ac04b749216c +size 691437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 79eacd3aa4..8312846bf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f66a16d4588b6f72e7fd56432ee12d900965b6461273b01de2096ddddd9ab14a -size 746451 +oid sha256:ef9424811f8be84db2989fd5f5c04cdfadd7f450e45242c8ecb0b411f0cce92f +size 758341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 62c912dbdf..3ad59629d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2291c86fe9e5b6b8da598d1dd4af2bd1e7ed351fac0ae84d2729ddd7517d0fb9 -size 640327 +oid sha256:d8eed19275da41266b4eff5bee241371485ac22befa38fdfe7a828725f96d541 +size 653893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 06eab25d62..907d3ecfc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fee2783be048f803b820aa367b8cfdcaf373fe95ab5b514bf7c28bd8118327d4 -size 768007 +oid 
sha256:0d03bf4e0e738b6a92473440007cbfc7c878d266a59e4cf6719f18cbab1c92e4 +size 814627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a025a11c97..67cae9b00a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb8d928b4fabea82f6b11fa381727ce5145d548b059f607339a7b0abafb10df -size 683295 +oid sha256:29acc086cecf0e08ee7af2d61c8589d0103c59e3a1d5aa2a8cbdb39de1dff185 +size 720097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b6dd950018..5f78de7a4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:363ff50eae0fce3d73e91a210554690603c5ae36a73d56615fa649f6d51787a8 -size 740183 +oid sha256:6881d879e258576cceb48839556c9d15cdad8d0d75caf85650294eb2f956e2ec +size 759523 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3b14856eb9..0fc7cc1daa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a51ef82afff8aec7a4dd27450820a11f550d2cc9ead290dcad6ea09f71a5d1a8 -size 655519 +oid sha256:a9cf5c7db1a1ae302eefb940ddfd643d9010b30898404f55636cb83d1918fb0a +size 675697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index be527b589c..051b75b9bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:586e8b18c9d87d2feab1dd51d11cf5cc6f475cc0a6c75c660c26438681ba56d3 -size 836763 +oid sha256:e963ccc625248424d3e8ec53b8e5a33c9ca179c9ea1cc29463dd4ea7bebd7f7b +size 883185 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 528fa2be4d..3f134cd0e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c04ef5414a62ad57af5628ad065fc8217e625e6765001d9075ca9fec2f1341f -size 734733 +oid sha256:4e4d9f06c2d023725789f9ece0e2d04febd88dc5ec7c7f5d1b0a39fe705ca695 +size 759351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7f88fdc882..da25760727 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2ccf31743379879f2901b6cd6ff7085ef159fb800e80863229a9f86df817b3f -size 808939 +oid sha256:104179fbfce12f3e7ef1dde767aef2cf550fcc92bb98bc2ff92b07c5a94e1f97 +size 829955 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f53df06740..626eb89983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eaf3bf34dc24dff469cc9db9040c0919a22cfd5228ece7b4d1a0acf80bfe8fa -size 707749 +oid sha256:b95fe40ad2f6502dc02626c67d15ecfe3826aae645b9efd411ff75a8717b2041 +size 721857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index be31209e57..12ebed830a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed9af69b33da03f27af28c3177577db205f2c78c57ab50fd4634f0765ad739ea -size 661887 +oid sha256:8e34ff935ee2c8d263244a53688926d98a1f350a6cb9df2390b2b8cf34763287 +size 677823 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9d681b251e..09a001c9b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44f9343b7672275e76fdc91fbdef9336c2effc2e023054a60d584c1ce340034b -size 576730 +oid sha256:459ae585cfbcbc22da3caa50565ea7d5e863433be5b424fa443efd1d59c0f806 +size 588766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7aaff31b0a..8ae3c71832 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7e6a8108f166223dd791a26c464af46f96ac2f1a13a2315387fb1153328b893 -size 662281 +oid sha256:40029c144ae6c2e0bc531e678b553bbc0bbe07d36999be46f929053655fdd002 +size 679745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 776d55393b..abd95c6c7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d6434e33a3daa9977d350f8cccda1d8948c2c75dcdaba618dd00c0ec2a4b7c9 -size 583288 +oid sha256:2dfcf5f82347105d57a19dd1e0c5fca8d700b8f173885d25a3c22d6f71b2862f +size 593796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dca0b1431b..7e3403b7f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebc66204ea01392f87a1d13d7c75c6efec72f0147a5ddd5881a6567d3eb17d94 -size 728915 +oid sha256:c201887190c0c4e080eb9a78a3086c889eee59e01592fd6d353231919e6e5274 +size 744997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 561cd6bf3f..7c9120c1b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69a7e1422d729d50bf84a0a2509b9d4b9297272b464b247cf1c54645ee5a0f5f -size 646421 +oid sha256:e12020f48a69fd1836368fc7c9fed32906952068f528e208aa3de409ef02ff4a +size 657521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 461ba30d49..ff4d9cc0a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e0859044da9432034fca7f7d0a82be7c58082a46f9c195426565554fdff8ff2 -size 748913 +oid sha256:2b928201836198954184d583af4ff82a2f5094947f5a46fc7de684c66953027d +size 769437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6d4551bca7..8b9b857a0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df46dc3016b960302d476c97e030ccf9d0b0d1280a1e908c391081f9c2ea2b4c -size 656701 +oid sha256:9779f2e3abd4d936b8c2d4884f989a20884c958861436df07eacd0364f0b7015 +size 677521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 07b3635ca6..d97758c704 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:958b951cc74c33bcf8a91831da19c98b6c0c2a2b086db76563b0651f24a70b39 -size 741761 +oid sha256:3c5de3213fbade54fc17629ea728f54b4f00df5c9e19ec5b919c9b90e349b7ce +size 759717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
32007c29d2..b2ee8d2d03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb52b145085f6e9042ade66c0c08153588390f6be91e281da275e0d48b2a849 -size 649549 +oid sha256:9841ff194b85ad6129e6309381ec2da7c03d812b31683b7ad05d05344cb3cd09 +size 668641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 017ec835bf..97bfd58224 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04d58d648ba7f8a0b421a9812bc097b2a4231962d3cd10566f3a9bfc5c66d91b -size 746297 +oid sha256:b98911adfdc00095963f28286837c4d8220f67f2c3c3b4428c839eb9d5aeb01c +size 769779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 427bc34a58..434e9a8474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff8f903185eef04d5af988f2728110f67ebe58512c2b441f4a16b80ef3d1a8fa -size 663409 +oid sha256:e6d8727377494104e23da1099d1e57bf03dbde94ad4302ea346ad23fad5b3da6 +size 681711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bd6cb9a409..b0c92966c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:087e834f31f6113c9dce85cd96b8894adca175aa99cb8c8c8ee1df152c7d3932 -size 739291 +oid sha256:d186c1283fc78f4085a0764a7a02e44f9b23e5d71cd6086ac33571333949d192 +size 760703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index ccae9cb29a..4f166c6ed6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b1125013a89f223977f289db6037da2e9da7abed138771a480802dbb4c3fd0a5 -size 656255 +oid sha256:8fdec581e9fcbe8ce80fa1d3d2a5bf27de5e5e21ebcce048cff0f191629b793f +size 672091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c87329a14c..0baee47e53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e7def3b1490828bdf94dac26d9a1d6553392b7231599528eb6770c8cc7c9a41 -size 818703 +oid sha256:229e0c3b2844279b777aed86ca3b37cdd94167f62a64b72001ed306570ef43d2 +size 838485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7bb361dda3..50002eda28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ea335c501610c2442e0c53648e805035a8c78964dcb1c4cf6157e6923d8adb66 -size 728021 +oid sha256:13a5adbffe05aa710cb8b5be0bf38276fade632a8c0be981bf3d9077fed81dc0 +size 743067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1e842d97a3..80765405c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a9472007e252ef36e423409395f795d7c978ae08f9c7a23a0e8c7fea9734291 -size 811549 +oid sha256:720957d2123dd49febcf6851965497e034df39ad8676b57a15c630625cd78a71 +size 828717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 76fb969bf2..66e37f6715 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5f293ed3c89cc8d125acce49982a619eb8f526edf936b75c33d95f88e6c1a7de -size 720867 +oid sha256:dd1fba515b827528a8abca5730e85a9386bad198a6bfe7ea6ea39c95d71f8e75 +size 734187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index 707269157f..899769309a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -24,7 +24,7 @@ namespace kernels { // clang-format off -#define TLLM_GEN_VERSION "1cfd7998-dirty" +#define TLLM_GEN_VERSION "10a85386-dirty" #ifndef EXCLUDE_SM_100 extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -3955,1950 +3955,1950 @@ struct TllmGenFmhaKernelMetaInfo static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { #ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5b47411e310604d453cfa6925047fc6328b2f28d5c4dc048ca68eff94804ab7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, 
false, false, false, "b7089366bf4134d8ce6bfd4b8489b393142bd5c4df2e7b4343339897c514b08e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9df75a6c91a18780272dc91ed956102b43d0fb68d9e482f28eea3f85fa4e8631"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f32468afeb2c8e000bff47972756a806ba913a82b941a95ff168925e78b58c77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7b124e29362a0f3af40a36c20475dbe743e1c3e62d9ecabfd036cf0b49a76555"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "86996021692f6ffbb8009a6bf6701fd201292983f9f32aad10d9bf9df681543a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a78d401bd217c9a4f82aad21226b573d06d7c34eaf86b6787fc6d431f0f829d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "20163ae8b3396aff4456b85a30d3145502670a72596ba9bc59ddad404b94f059"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "8b8d32e9238e23fb77d004cc285973d0c7d0f3be9ca48efe96f60c9c3d1e6110"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "10934daa088f4e17844135e0cead75d2061292a547de36fa9d915ef0832cf4d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c1dd6223ff53db8ba2d0d3ec9f662a1c050b1ff6e04e7c4f90551ada566cf271"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "9ae4bf2ded85ebdd05dbc51893ea3eeca13eb73e7e9a19d05e9108d273a66eaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "dd89385db2d99df26d8c2389f276818af07b06c19641fa14c24cbd5b5dbcef55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "38efbcbc54503d290e02d7b3a7738192ab24df642aaebeb51482a5c3f0ee54aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "adad9bf19df163b3fb197538bb5393c007bf361b919b46086588e2749defb40b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "aae998233f8d0f88555b0daeac83c83090e39674bedc7e426bc826dcb1a6d1cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 0, 
2, 16, 0, 1, true, false, false, false, false, "9ef253110abcd3ce8d187b76b90ef9af876091dcd932966ead161964c033a481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "bbc673a2ba77c46135253cc85e857f692758b0c93f5ec645b289ae919cbb272e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a290a662a5a525d4322be227e317174244f2d72247409c739f2d7040531d4291"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b30f3c26c639d655b797a75436a8ba0903ed0ea5959d1182ec0121bef3090e60"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8bcda4264867768febe73fcb5d0395420b8184c25eb1b80e88d551d9a0059d31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "16eb6be8af27df03ad4a5dbc506d4fc1b58d05f15f90fb64f59db0de06aab883"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8798199ea55ac3e9ac1189503b276e06df34ebee40a803a3f59230978c4dca8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "07e70c9e937bab61c3975b4585ff6868613abe93fe230b1eebf44e22bd93e774"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "38660c709379abb5122ba6723b321a6c5d980adb487b7589b60e1e1e7ebaf7e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "358d019b8dd71d48ed73b1c2d3e1c7e2182849668f5a1ab550f439f5f0d64f86"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "ca83926194a0a4bd8084e7839bbef33a1fbfd76e995a44110707f5ebe798b1a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8fcf2f5c380cbabfb1a8749a375c40b70b5d17914333e4257157ef6faa383f1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "86448a78cbf6f94a64f0eba6d57585d7a154dd10a1f5969749b0dd77cf48d772"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "28c6c0ec7b942136a0c1ccba453d844316fd138e3c5c58519aadcfb3f71bad34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5243da5d5e083182089b9c7c21f53316a581e71a1862a3ac0102fad00e25447d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 
16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fb53334a6551e74b91da17bd3bac70126d7406af64cab369f7c77baec28ff60e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b258f5a91012907b87b6f43bfcb636bb41d264b093d76b4ede1587720e92766d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "ba573b29d91ba1885161c540b38b3947bbfabed0be8b6b3cd7e68c538fba364b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b2e0107ac8545b65808cbd0f2b809c975a35d1ed65cd0caad08a87230557f108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "7dc632b606ba942122bc8970207785b91c797ceca1be42a6d68dc3cddb3d9bc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f95fa8c512b22fd447eeaa569f8ccd8e23bc7c20043e50cb653a117b543e31b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"2515f7e8f3acdd7617ead6785fe4c38e447cf005eec1eaccf64ba7d2d17e6bd6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4de702a902dc3134fb668b1338fd489a86f3e8438bbea67ec3fad98b6737a548"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "95895d582a42bcf05e3b269caa0070caac04aef5381ef9d612529939031cb88c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "0dcf52013d31369ef0f981f9fdb2fbbef0af73ff60b3bf09ca7a3c3c280bd340"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "6e2671fd0d38c82c3a1cf9cc26d377fcdcd610221f9ae246845ba78fb588660a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1718313339b18d6968fa3d9573f093a45f1ee0a14e9cab9e6e416e32c2d75641"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b9b639782d3e1145aa2b840490f038796138d5a3f6ce628db2eca1a7e3c313e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a44aed72d0af6a5f61cce1f54ebe276c187b3a510ce75a6539d04b9beb1c4a23"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "be21be03aeac61b732c0c9daec9e6340ba998238f9aa6ea24935b2e93cc61b0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6991687434c6c38ce0c085b3816f50050cec305452a20a6969735bc11675d67a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5a0c9cc5d47865695233859c43abf7a500ed4cae224e8ba9061add3fd31198fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, 
"c9365f26b1d650daee7c8239d227ca481c715bfa0a59dc7d3ebfabe8b82add13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0538ac322fcbc3375fff6cc41b6f378c868177004f8c214fcccfe3fa4663f440"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "effcf544ccaa83d6dd99927e4b1e8b3562a0d913fb98466d9eee7bb090bc93cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "10cbbe02e179da155d2f6c73007c39c6163aeb4f2bea85abd702cd0f8b518f5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "579fe774f2b8b4084c9fe1cef32b5bc161741c7c55891de7d3ed37716be0e368"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "636f97ec108cfbc82f77f0b3c7d11b88f8223010454002440edbbbe8fa62ce30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "62ca84248cff6588c6f9842e084debb23a6bd5409fdedb564ac9aacad88e871c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2c2e864fb73d2ab76bde3ff62b7cab1b77dce3c7a8b590715a541ffc1db617e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "92776e4919c2714561875f78fb69561a1111316b2349da9b17b012579fdacae1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "fd2736265d6076de0c99279f79126bfd924639b66d506a2c197eedd03f41fb88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ed0646d6f753346d8fc52222159ef5d64bc5fb2ae6c78d4de1db9eea083e40b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5942ce27c36a201095bcaa096fb118ad44bf8f70cc3cafdc73cc63395dbc3de4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 
128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "d983a647022cbee8f27518b2fdb8c912594322a6908090d62d67358e8e0daea6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "0437878dcd346eb991f12c17d704a7208d1ad76c54c27c655c268a2d68ff7ef7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ee56274774fd1945fbacc3e3f6826f7503cd6c33cbe14d776591d6b3ca89711"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"518c554f014765bb146eedc665c73d940a1ff9987660f89a89c813462b5a098e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1fc0c3f18883c2b576c12246843dd524b796c25ee543f8fc5bce0621f553bbf0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3c575020c252fa5f67568bfc83013ad54b29ae0c4979ae80399fa44cf9ddfef5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "1d36fabf4dde0a6e1aa115e88a815ced312cd648fcc75aaa85e4d351f6d31995"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 
2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "fe7a196d57fd3d48649aa532a43b18cdb1a69e497bb76a529fb65b18bbcd0623"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "fdb3d136480f05cb3a7cd34f1ea103e8ea8701ba66219287304c8832caad7ba6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "60950a921ab7ded09ff8e6a17a265f50cdc9c40b814bf41ac8214950e6d212e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "64ba82d2842303aae7ff93c6ab3c90d1d1faae1e77bada5bc211a44beb1b7587"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 
256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "5ef696589e3b8e05c044ef8ba2afbeece8f667eeb593ef2f891ba365312a366b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "524c159d7a8741d17e7bcfdbed1dbe0d050d0719df1a568c23f8005c9f528887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17d93d9bfc418dc3522c2a03aa39cb8c6341a6ea4304314c4efa7535d036bc35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8095af26ae7f0629966cc3dfd7e20b1e5706b12498f9a4883a26fa709a5c0e6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "2e90ef5eac304de569b10217b7c750c299eaff495bcfe7c375a9768fcc7e2784"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "f6aea2015acbfc1c96bf8deb895671315f7cf38b581bd8ed2f98b15943a00c19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "1ee2005075dd87f84bbc940ca7b3aacdd45ecb91c610484e609c4115aba32d14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 
128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b222da066d17fb20fb7f139471e2a1131b490e1e2ed5ed84d5f2e0062ca05c35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "5b0351db998ac7420d7cfa411ddee1fc9e9a9351bc7190f11df046204320ffca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "8397e53f09ff9c3ac4b120158321df0072562acb4f56ed4d2a5bc68852c355d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "828f625b7714562cce5f3c9d6637597bcd5bf3a9f6cbecbad0c625767ed186d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "46d8677c271b486b3f4dbf894d47764ddb1aadc8daed907853233722ac976294"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e873324efb13a266ccb8ad96623725ccea53b45cdb990709e15dcd7f6d018a93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"79ee5304232eb31cf98a4d15e45257a61c04e8ea8bb37797c2b74c22de92e76d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7e34ac4ca2daa6f66bd958537033e4a510ba34daa8a21305055d7ca17460376c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "802cfe7ca614a007fe21a85e9c640963675addd8a502f85a6da0da83942b1dc3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "167f80a3acf7d725e8230b32cce19175edc42ae72f61ad1fe9a8aa65963ad72c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8b8221fdaf6fd38f3bc7ec734ff9bf15186f0155d0a2931d30099986f6d342cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "fc76d3d43d56957d195c68eb5eaa7c46484ef243385b213958ff7cdc95e2b422"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "edc88278d79b1b55dd4d92e9c07638f33df6d3f1d4482682430721c9a9793a2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8ed49bc6f3b98c2c262a352c31ff66aec095dad031fd59203eec93804ed23141"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c2519d882253ebd921c1aa547ae8881a098386018f544b22bc19efb245ebf34e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4f25665b75abe1c1484bfb292739476e1b5b963079628d7c0d3e32c78c275e84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "193e7639bc0d1301c36bfbe3aa90db9b64aac6cd8f3e9e6006afae22be2ccbfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, 
"7e7957b828a9c65350ea28a8119686b94baf79051694546b5ac2af93a5890873"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "22d5f4ad84b05d17ff9a0e6c937ca3c5115a100b18b79981834ff7dc1ba68c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8161590fa51046fcf745655e140bcc576825fd001aade4ba6ff72a565dcaf29d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "24b0d2fcd2989467338f8cf1b9f6538da85cb4cd196b471718c178f79c6ba756"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, 
false, false, false, "c190bffef51f34ebbc3cca535ed73829aff6b3903123253985572af49393bebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3f39d2d7a36ea50e6e9588f3512e6d6aa000e86a1df27e4a99656750ac11d70b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "0328f2640952ef215a71c410c69d6f04df74070e12ca71bd7946da3dfe0c2048"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "bde94473a82675369806c48f91d3eb168011b1d9e9d2e8c55bfa8b198b215c05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "ba8ee754bb9a51714832adeba89b60931bfb803b396695bb13b0d6bbcd8a856c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0dd2ed3daff3e2159b23457849ded014afef21d25a31d49e42ed3a29ee5ccee8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8a519feecc4f95cac683792e6644acf7276b3e47b9c8187e3267120293f62db8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "77e9a05ae49524400de173351cc540344626f889e52167330b5d6af9d9d845fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "420c40922c626a4ee1aa407c66d20b460e64be56b36bd4c6cee0586f7b59f442"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "996f4361436447ca1ee740b864aaa940186e217c56d2e099f82feca9d2aeca26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "00c6c39e04b123ed7942e9f67a77e1d6482d6301736cdac1065dafc5fadc3cfa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "d50a6f31139607aecfb12838c3f97ef5879fbde91c2809c74d1607e272bbdf7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ca064a0615ed7f3c835e93191d27419867e3174274a6ae4f09a5310cb47ae5ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "70dc49fe4473a64c777cc40aaad768d3a9faefad5a96c99011b420e993c13b39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "66f088cb381679aaa0881be1333e8fe959c456315610ab068fc0794092473abe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "7c4f6b5b4ac43d6654e4118ca793732023063fcdb6a4b840ed1986fd9bbaf761"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "fb487c51a14d3c30c9376a37e28a0f01358ea5562307e61a342ff47403789889"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "dddf40cb478b37bcc092d561527ff36087882cfba1757085f67ee522e606d30e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "26da7ebc464ad98148c2e5ffe8bf3df0810ef3c0dfc962e994ac9bd466aa711d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5ea4c615884c7546309a392364e2266fb597b8aab6a87a0fbfc70b41e5ec10f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "61b60e1e29eb3336951dad2dd824017b14ae85b0eeb5377ea2ad7a432a2c5886"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "4cf1cbdb2b5fc7d7fd2b0b29b83101dc1d860211af7aee60423102e7a479c98e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "3f7a410b5fafaf52ab39ec0ed7deb3638ccea26892e73ee418638859256ac5e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "81aea9aa8dd530052ed40b5907731d665bc37b44526732391cea0e7b9f2c4e11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "91bf9754b00cb9336e2f87362563b866c6d94683598abec081e8d750c650ace9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, 
"5540fb93ef1540841e1d940b9d84254a048049d8cc1b9eb132190676d209962d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "ca83d2ba56bc7cc93fabd37de657760583cec2f6d0aab03dbef76e980dd695e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6b9bd206188c87ec7018d6c45bcf982fe2884127b57e12daabe0cad0cb01bfd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "75ea1fee8d1e352ccf6b0c43abeea70bac55efb71d8d15b5ba4795edab1efb91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "e7c2e111ab539546a7c89a05b8a54526ac769295a1a5b28306be8a1a2787e488"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "021fba9584d70e8c658f4b0590d0d58edf6940c10b4e80b2dea8c1ed90b77398"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "21e5b9c8f7eae472620af87b4ec095d762d8ab8758bd4bfbfb0f8f3a86a0df63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"1f49afd519e45a8c3d122887e2f75ac863d9fa16cbbb2fb15872660acc05cd69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a7d22360c4177a9e943d97cf4e522cee3b668fd93be6783a5961517e1a1efb15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5bc59edc894af0a0613e2c0fb6926e3418d32eb78fd8e185a7d54697a2b79c89"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "9de6bf8f47856e3bdd5262b6f8229f44bf8fd274c6967020c7f65440c7ab6148"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 
1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "41b50122598771e0b37801ab74006bae5567c3a8a8866f5816b7252dcf35aa37"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "904150933b5d99214c7243e16625bc74196d5f34f3327fa27ab0a618e543b2f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4f78dd320ebe8a3349fdbc3b5802f0c4f699de0080073b7724e5628153b16242"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2e84a469796f97d78c1defe38ca0bcb64583f4c6cc12971df9516b31bdd30f07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dcac67ee44de1efa866c87f9f95d0cf53232cc799d91160725db84bda2593313"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8a166ae2fa5edbc754ef9a8da8a9ce309c836b0f8ce4da6b300ede9ecae94997"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "e6a2579a90e389c1ccb6e10011c01aeb960b3fbf4487bf8bee0f25e48dcd6638"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5d9e623ea02a92f2aff11798892024eb0363be7dcc83f0fe5e5d86b8a623494b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1f91045371232ddd59e6bc3a199593b6c05a236fad4f3f17feff63cd2b3c625d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "efa83baa0b291df32029888acbf8527eaca58c64228ff1f76b3529e553a56357"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9f20b7ec82ffd51066b501d6d296c795952185c848e4b57c601de8a38da8f366"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, 
"80fae3514b2b9d532188467868510c9a7fce011d1c36494eeb9f8a69e4744208"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "76305d48ff41fd4dfe333554131a606ae5829fef043985bfcbe44ffc0ae205f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ae2abb326b8508f8e6bc44f530e25e70777bcb6eb53e866700216be7b3f30e13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "bf4c109f34648376192f6b659f8da597e40c817e5bd55bc9884c9156f3c257f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "06ae8f612c955202f285161e12265ff46604e5afcdf98aa60a0fb272d61f4559"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "361afc34bc642b5cd4b8bf94a24b39578accdfaa6ac8ab526f52c87706c9afca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "38af2450832ed2d200d7ddc3ffc9c067bb19b7cf6dc9951b8bad2e2c32530cee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "46cedc42635b3ce993ce00e39d0f2ce173846c1cb81590459f9ec7b0ed86a3eb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9db673b129743fad5a1338abaff0dc46b611daa263eaafd5bafb9d942eccc279"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "197d009e2e938d3b41cf9089fd7a59fc0010bcfbe239db25bf4f0fbc7e2fdda3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e8716fe0843a121a911eacf67c74198f67db4ba84b2dcc7fa70c6bdff4d78bcb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, 
"33d90e7cbb3f6880e9ff52731a699570262c87aa61670a253dc45e606d079ec5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "85003f236af5d5a8ceb8f5dd45ae1a01935c85d56ff84ecd799b532438b43ded"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8aef07edaf46638b6b13080f89b1e651ff2d503885f8a63ca30d5420d864130f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e7422a5b75eb14a7d0ade2fd1a5638dccfd0b8eaff9f1c58f154de0d88d1cd48"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 
197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "52595650dcf4cd06dcf4e67360757ab2ca81dfcb9b0b29c387c39422437c6fd6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "de8b79c7359b6ede2013e1e4236ac646b3209ffa291ec685f7d35abb6674f5bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "29b9a0927d9b18c43fc4cac05ac7d5d61e05dcee72d62cc3b16973a14b71137c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a2571a10e34968f91ed7e0fecea59479e111666b44aaa6429e3569516d8b2576"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "2ef5eb471b481e517cf888d55684cb2180b792669fc28371a3b5f4937dd0faf5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "2e7ab27ebf3ae5f93d9a16225204573972135d5d0d912e1792bd69558022c2df"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3af88a609f9b2ba23e00d68073d0859d291fc8be34e0bfbe67b9bcc2b6bbee6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"53f93dfeb190bc1471fef6276b5aed198b59f94283ab4e27fd1cfff2f9cb96b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f9511381c2c6169ba174e1c40fc11def677aaaebf8f7efc700d3f59ba064fe93"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "579027871b5790b4c385318868455edb3d75a031bae02ff3563089ee5db64bed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5effc658e14b8272c9b2102c264768cd2457afb2ea6780711f44988d418c8a40"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f81d59d7d0fc5c847f3f2af016a0ebabc35a87289753a1bdcc89d1747e4ab03c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d191efd39f2c9d2d70266d90c68baf939e10e2d1da565b28c6d4703190a79566"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "95ec5f71deceab32bf168f14091c218044335bfa80cd330c8417ac310d65b57f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f212ddd3b667f69553e79bda97e02a04184ad977ef570f9a46b63d081dedcbaf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "082b38a3733f4ddaa3b6f20cfc8d03f5ab8ee8fb06999b3d5a5d222a67fd46bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "37c1b0a520952303f4bb94376ff0afc67ca83af4fa4cfd1cfaf7f0245ebd43f5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "29bfc437312591710498b0ca6fcac9801d2165293463c2c9e3c7e0e4d2795e07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4f4782ea8c900bfffcca95398d5abe9793d8ad246c9ad0199cdd3b86016302cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f152939b13bb7a932aaec1d268c18b6446a57d734aaccbb77ef7af2ebf923c2c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "67f35e6afa05177dd96955c3a75271e9fed0122a17f70259f0f0314177aecdec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e7d9b245f580371b9ab723130b416a30ff7b505b90d4dab523f2303950087f08"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, 
false, false, false, false, false, "b75d12e7a21e2cfcb8667fc3677d64451289fe77e697fc9d5a18fead4f24c177"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c2d30c8fd06fa7241bedc8259f17a42a4b7636cd7d7c1d1fac8da1628b980be8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "74e0e5ba4516436140b12edb8fda74eb43f0c34ea2d72d0f5cd941df297e0c7b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8c13cc43896a58dde2d83db32b8283bf480b70487a57916386a4463e37bfa2d7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ca8d89903cefd34b918b647492d08698fad010c1e837895a0cfba64480a18027"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "94e5ac504aa7bab41f75f327d494eb55b6a7cb1c4fd90446ee13346eeb0f8f2f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "afec6b265db7e459386758d7772888aae6b258259eedfc80eb45867e9651434f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0e28a0c9a5dfdd3fe334fa71c1c7702d4c576a5101dd103a1d4c28b88b36ccd3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "07f8cf4d6812509698c668d0b1a8845bd9ba04b5cc7d71ecd63be4c8def30e6d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "28c0555a6a9c9da8a33f01439022e16a1b8fe331f05a91e47ea804bf06b33167"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "50ba99df1b60621d973bd28bed28d56e7a0beac1e0554c19462412167d12af0e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 
0, false, false, false, false, false, "38c2fdd89c53ccef4f829d65a45a033bad3ad908ef484792f2377a65efdcdc4f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9d2a5cff0d66186d00b20ff6eb5a18dc6e742af07929fd75955f5c98cfe5accd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "7c342934da232c453c98220d993db7a16be0a0ca494959347a3dc69ad79b5d75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7f6288937fa89ad88e286653f751a440de056e06495a2ada7446d25189531c0f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "19c19e69da7a6ccd08ef9bc3e1ca8a2203b60c1f2e646e730683f120f4c85787"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "889b7b8c407a558d4491fa5c198c022a46901f934fadc8d76657c11c76b755d7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "302cf679d96a70ebf2c76e1daca2f9ef3077f5afefdb46d8d16e7a7c3001b61d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1bb96c7b43e5a95475b9bf5d79d9284652ed92110bfec004ab95c3dad82bd0e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e7d1b275523884e3c7679ce7ffef231f98898c4f86975cef9032174508a902b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "86ff442411224ff453dace8d88c751ca18b1c47c4a2e72876d8603d8f543e762"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6fd9ea0e9cc06c7ce910ccf50dd664e01313e8a05ab10fadeab52fe0209b9b48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "501e1c729c7e90ab6c5960ae1a63f5cdc0c36bd7468c5b6c10a515240aa5ecc7"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "1b8c83a3ffe081ea4af76413876f8e3fbee3a4f83e34671750b0e417d4493673"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "bc5c13aeb31365bf140fe819e09e7cf763b43c4ac477eefab3245cb83f21d95f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "847c61d0307696ec186e3b0279a3e39e713d17074c65d88862df75b0a521d9d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, 
false, false, false, false, "f5e805f2dc18c7b65dc031a73b42566ec7c8871ffe22c3438edc2a7e246c3815"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "22ed279b98ba2e82d0d3723c4136ba54787652450aa7d95353a45f16a073b091"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f285cde6af3858a799c21e41cabe8e4c2e6ed3e1c651d9e63a47f39787149fed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a766ff33970c05167a1781dd6f4cf90cc3e75602c6ee7de25f847fbc3d4a3ad5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 
0, 1, 0, 0, false, false, false, false, false, "95420dcc40ebc52687006db8db030e6ae1691a6f1fa69dc87937b6519cdfa1cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "07ae7f2bb2336174a68f3fb8eebb6430f06221eb440e6fea290716b0319c915a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e24e929800977f2351069b9a018ef40bf0ae852f6fec45ee91fe347b0128bf03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3809bb51089d63d127d3bcce37d3692277b99cc91893bb393f32f0c87c9abf3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "5c9a65079c39fc1ad2d386686b7f75296aa8dd6a1d6c57990d8cf67f37a9b8b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6e73cd31506f2bd8ad56a1420342139968d29b45bf5a05c4f70eb9efe94bb1c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "6c527f9407dc7e9180bda2bcef6458747f695014d5138b9c49e667587224270f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8b00a4b89aadd47237d189837eb9c4dbb73ba840621fce85dbf8597537224a2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "295e8c1fd8a0c5257b6c1fe66bcac2dfa3e8c1892b97a176974e64a82146605f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "543a495d4ac8d6eb20813bafa08245dd42123c31d89011cde3ddb8f13790a411"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "93ce0bc96a070735a07db092a0dcbe5f03a983ccdb987d9ae384fcc88f315f79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f5b82c6a0319851b776d4dea4a31b5a6329fa034bf4a8a2613ed93c028dce2e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "f49a794dab421fc1fbcc8f2b8f51392a4836924a16c605cd8f62fbf3e4f4e68d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9d664f84de3c65bd12135d7ad1b5fc46742d3c71c7a279d45f3037e36d1a738f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5c63f0585691346d7bbb27f15c0055df6fc18c34063f1d382f38e60ba6e48945"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"3d916b44abb5b7a644546c998118f8280ba32fdd96daa8e6e796cbcb09d880b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "9a8ad47686589f0d509ba98af340fdf3c10b7048f3d2aa7fdd839de6068d62c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "c430db1dd88727dcf2f1af52b5dc2825acc6b289e8eebf881de05981b03d9f67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7739cb22b4ad40fee6c2ca30762046c58dda5397d1905c765922d2ec160f3bdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, 
false, false, false, "66ef8ebdc9aa6f40bd122811eb8cc5bca474d57fe3f7c639b462b584ec437279"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2d12f7b099b65e9603293ea41a493e08eebb846025b0a0f78e0c52a1404b1d1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b2971e46fa47543a1c31a6cbdf17e6dde2548129405cedc95087c8c0ee709477"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "90493f0c2043297c705cc967a307d3a13f4101566e36022bf3a98e1e77c34f0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6f76d193795a910e968b581f8af850cf39cdc7c8895de4f6caedb0e02bd3bcff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c43185bc36cae27061a580e8a8f8441a38a3528654c8962f9d561dea07743a3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "5fbf3842dcc2f4cb791c62acfa28bb8a3d34f355f666a73e541f812f08453886"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bb8f2fde10196f4d688f29db57c1202668332da6f768789778f3bcf50186e9e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "62abc6e8f8af98256d6e4a7ba3768d1f490947cacc950d6f00ef423e42aa23e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3d53633b0d5635a1af9251385b13df8bbeecea5451440abce380281557832d8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "eaca321a26d07192cefb80e159cd63ad157976371b1982f6acf2955dbfa98d75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "679e5d30c0ed420ae0a396e78e439b0dcc0e045be2cdc22e4b1f47a0473c59e5"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "7cb8b04af366c4cf8a0693ff5fc751fd5de98f37e9db6bc508e59cdfaeee1f71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "23d011254ab47f98742a861b10e8d028fe5bf1190ecb2be9ba5751757282b151"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4119ab41c8d1f948d02c525863f623feb6404160a370de0b64e6aff1bf126f7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8a0aba78eb20c6ee2c284279311251263dc88d4f410f0e855fed4bc44b46418b"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "225014da5b4685ce6f004056df6d7c85ebc2c63c858702b86b365c4bdddbb333"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "70ed84028fd356212086ee598d7828b15a4bc347aa0e5b9dc8470362f1cb3406"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "827a42264269acf5b1ccba88f02438dfe94ba1a41380d89771897f1834d950d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ccf19f64203259ead7544d8b3ca4acba222837917c86b18d6e71162ad8dcab16"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d1336dd490cd288053aef7ba9f543afc7069daca6a446f924df8b3961ca225f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "c35eeb4dd418237d08fd1c315dcaa2681c813485710b4e71b6a5020372d4cb8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "df4f9773c7fd35f3304b7907ddb78b5b06118b1f52036404b50f85961d6c0a4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "77a4fbab839b7acda2e52b3ae3095a09563dd69993c7b812f58105e9f99af421"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e86b17c8406241ddb41e3440f71a3b7fdc183dcae3ebf5efe3b8eaac37b08d2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "8a76cbb460ee011490e1a1d07c2665799b8e076e1b5bc68fa70a68678e8b78c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "9ab35fda78a1ccd2337b34bf7004cad5b94dbd036bff3892504cbe13dc152450"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a7fead941796e9ecd564ed80485c0d55b1399181cea15351e5c6b46c0f281db7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "af8a0e43fc46d95beff2a4fca4413d4209a6e4f9a7db03f9610fa1135a3d23f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b7c0bab5b220129bd4781c9ab59a99ae1017456f53a7d8c3d403c6d117c5df15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"a50e8bc70e19044df365f04a56cb76d7c0456f9ef21471f9aebaa446e7e6e8be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "02d2073e1c1bb327bc0d27979816e316f97986ac0f83358e079d5e19cfe7eb3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "027a1158c5be075fcd09d563455cd4bbf01225bdb683910a4cad21b0a8714ce9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d9f2c81bd1c17e078aaf1a195964725a3a2862074e931ead958d8ed9ef5d6a5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "717e1753371a488a3f3bcbf09ae387812dc964e8152bf1a600964794f6779d84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c228e917a69a254e1b3423cf156fc4e4d0fec9d4efeeaf4331dd139d90f6b1f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "306b878a14760d65d4be45128946dcf23a34b491e752f31288bd331e70fbd2d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e0fa25d29ec805ff590a1152094c750f4131f53a6444887ae5c2269fb0a1698d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "954769b36cfff21b465b63adbecdc7efc964398710d6f371a6dc52abe21da50f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b4a25391cf4f8f58b31ccfc06b9247329087e6ae68bdcdeb63f39a79a9028a95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "dd478d6bc7a346e9f459b110f3b6a24d0d2250ee4bd6d9b9ad3782c74fd055be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c833b46b17b69c78ea74fd2f7e5b7ecffaab95604939b10b454e6e804d695824"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 
256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "56cb78545bbee665d213d2737a3b45e124f286eaf71801d63f3d5ba62ec66c0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3a548835d50aacb4f98174957e131276376d3185b5091849df030018be4419ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2256b7371ec2720fdb1c858793551ee67607e892f6642e79ffab8af0c8eabc25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "94b82ca9f8b6f255abf72b8d46122de3eb9553d5caed3b4fd327349bf6897f4e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "799010918fa5ba58f983ae8ca53f4e60af4e2deb678654a8568c2b93ed8b34ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4573af5162b3d8a24be4244db8c56420138eaff2e835de70bc46d5730c1f90bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "27c98430ef3fe9445e64f00ea22b27fc980809982658d383d68f55e9105c2bc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "33e3b1d4c792a01212f118fa6d997a189366b0d8ba093a838c5d5ebaa7dda2ce"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b96ce7321f01ba6dbf0cdb0eb0e8a2ec1b544ea0b4f02abc9eec8705594ceef9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "55a9edc750139519eb2dd5a47548835cafdeea6cf5ee597a0c5dbddd3043cffe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "5f38d9ee728b38ecc23868f7e9bd080de3c10654f74625634342556fcdb428f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a45069c3911eaffbc094bd4ed445a714d7ab22511d9f05b5cbed3f829cf8e345"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6935f91ed8309992a7c774395b1e069bc5009ce9811e9b9bd444ea4b42f38d5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a2337c27906a152692485962fd957e42b9960a71712af7c0381b21d85d34e37b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1dddfaa7f2e673313a36d49099cf54d47ddecf26d3e9b2d10e49f92afbcf5a45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 
2, 0, 1, 0, 0, false, false, false, false, false, "11931d8ca84912aca9c8f77c5dd130fbaddaebe886314793f424130013012134"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7f3550675a84b7fa7f8ac65c09e2fbb08b198f080fc697650602f64a73f0a353"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "0a56d5bbe19687ee697a0fdca860779960d45711417734271e9f66cc75f843e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "15b89afa10f83397e0fd0389780df85996f3825917bf0a9c84bb2c0fc9f8c922"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, 
false, false, "9289751a836a0eb6feaf7817cf1682542bacab43262fe80ab0ccce7b1463fd6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d50f0541a7b582387fe4a348005633f1cb450d7a23ec3b45173fcedaabbad746"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "392b7e135b664e9ce6e9de680e5c4bbfcc12817cd93695354c8d1c29e6cd0d6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e6d1bcdc8037c24a52f5732957c7f77f036e4016bc09146b51496561fc1889c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2112997932ef19648491910482621e5df71a4cea088d78a6aab80ac101fb2420"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e4f3818fe077c5bdc30438954345890db8ee239b9f18b3f8336cde5bc7dbe91b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "7c478299e5687308661b7bb3ba1b82c1af916a5c0a7c00a54f69ccdada82d59d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5daf04d71d9f33c96b5b4b6129c8c8e2d506ef1b077d9fad92adaf34102827bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "954b436544631eb553fe832e1db0d2b56b85d085a228494b573d15484891e5be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "cd8d2f53f25c06859d62e5fcc34a0826276ebf3ed7611335663e3d4a77cd1461"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "99c4de1194f7951f4d9d6dd3a3874acf9d18f63ea53daa0485d69ab9ca29488b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3189f4597e84a2894829f1727222758e3d1d2e7ceb8f45ba08047dc69ffa1df4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "073ff446f4b5338974bb21a788719b7d4a51fa3843c9ccf19b3a311f7e9c8989"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "aa8a60fd48004cb228c0f32b42854040b59db2c20aee3a588be2a78cb234d079"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "66391a0f06103d623300ab27568d5352ce8ea1600b824229722caba030b792cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "bbbd1af9b155bd6c1a5a26512746253d6d04f25f440555e862860fb9c09a03f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1dac9b072cb5f61dd182bb08fe985e96f2b1b6685e617c3f260dcb4d4c61da1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d63bed985c24bf05b567d60de7180e37964313db7972f3daae2c454c4f163017"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "501acfd423627f659bd8102d2168d9bec4a263a71a2c28e267aae69e8f0d1f00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d9c54541e4c67369760458f2dd8372406f74b1da4ed46f3d4dc9a49adf494735"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b7520a4fe355c7d204643726d3d2bec6f600b8df2d24af5f3695c646d3393c3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d9abe39e4d18a6110bf5cc31c0be2c13f308d58e7cb5f4dab4cdcfe69c3a3c61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8915c66b34e074696cc7dc35f2072b5646ce0ec43e09969966c0a570e085ff5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4bb4736bacce4a94d5d79c981d8fbe18d8181ce3ef4bde942b3ad3d29124ec2f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fe88d5cfaeec634602d1af1448313d12acd7b733ea08ab1dac3f6cdaeb947036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "767f7dac85828ad298c964c8b94198a22b441fc2757fb1b4bf2c49e0af74a24b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bdc863c7a856bbfdb451f1a56cc9b56f488db9ebb9843a2d71999a4caf5369b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ef5a83628bf28720f58775ac8738991d841cc99202f1a889476b714c50bd7215"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "330b2b7a2231c787cc1e562d6d0a1f1917355e9e81ca026a7f2afd6322c0a703"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e68bb4e8857f7da93bc2ea5804e1deaa1275f663236fd2e21ce9f474776fbeaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "8d0f24fba0794c37ededa81ae01f9eb4484c178ee546a8f984401cb075ab9aa9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"23999a71b8d498c5b21d669a8f79ba5a440b9574b6f32d7b18e21e584b504d75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "46d5ccfad5734e1041f47b9980549bbc8e255cf14ed8c9f26f9209b688fc08e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7daebaeafd5f55c643a50b8ee3ba3538f3e9a252df7977128b712a5d81056ba3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b6a9ea8762338494b7621125dc42759fb78f7dc64d327f4079b084b9b9f0b045"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, 
false, "de2dbab88d0123044b93b9bde38cd7733574e14a59d14ed3bfdaef5edba9d1a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "67bc1f58f53c8c8ebb472975eaaa7c039dc36de8919448ca4f04c11241fe9c9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e288f75816bca281bfe6e6edd6a94ab5c5106bbfe744d7051f0f5c3de48a0f17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6521e4296690aa5631408bd8d720a8dfefdb02a56cf668adadd35d2d22dc3c93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "7452f8578e23fd69af8611abd13f70a10a069dfa1ba5d45c0d7edd24bfc79eae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6537c6beafecbe61bf6692854dc40183fb72b1f4965e30e1651f7bd70a9f710a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9a5b563dce82ae312b764fe694acb8c8bc57992812a7c0221bc0c64b0f34d51f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "65341ba308c98e5adb0be9102711ecfd544a0963993c50401abaf9709d0452fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 
1, 1, 0, false, false, false, false, false, "992395e22c3c285f470213e04545c95b065cfafe31452616419107875a2cc030"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d43ae33ce94b9544a1d8b780086beb9d4956ebdca4baaecc672494cdbd5ebd3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "96452d7d4592cba1785097e4a2cfdb5971884a91a6fcf2e8c58e04c00ca539a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "988ed42b6c2d2c65cb39829835c214bdd0c2e137c3410ed733a81d86f06c213d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 
512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3d4d8bf3d94bda5159fba579ad149b2b421f0815c10559be563e74f7440707f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3a517ff4b3b4f27b6d77a13c44d01dd8c321b79b7b5ff71aaa0a7df6afcf81cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "88b7f48ba475ed959a22dd37b58456625786605d56b49246427117f911b3445c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c00bc61203bb2f0afdc175ca6675ed3cd8b7d19249348f10a5d3cecf53403c0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "217f6ff2c089e93b8e61cd1c3120ce0c479bdf5fa05024ac4be2f2b8be068e28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "38e424b843cc0d266f2d5fead0cd42c31f043f95e9a8af1816507e076ea563d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e1fc724783bc64d19a5915358a20d419cbac9404a3acbad1c7f29dfcfa44ca2f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2ad276b7828ad9275d95fbf2030949ee0b44d2fbe1f7ed0ce3ec8a0a9f1afd77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "df5bcb7a22e7021432e34b51d12ef7c0097cfca5e5e939fdd885485dab6820fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3099889df0a453c25724d3cefc1b630dd450b478f7abedf4dea718fbcc01211e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8cfe21718155b0c679a2589f15bb6409c3a71a06df9cec3b8a3a2521c63f91b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4ca2226dd419d911bf7f20bfb6bd7c42ba50fab24d279c3579c7cdde99c9adf8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d1952b9354abd02696305e17acc13466c887ab224efb958fa15950b4fd0fb8c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "22016da35e011832baeb554ef0a128ec94f199c2c82754cf67a6745d2db24667"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "abe5dc5a5150386f8fff5706e5eb54834a59c1c7d76194e123ee0682394a1dc3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "402a08d66e7e203ba931bb5b797007bb767d0d73719bdca0c578f8e7edee8632"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "52a5cdb19e61164290fcaa3be987a01caef54317bbc7e7c8c1ac09909adcca34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "69034e00377748c5002b501ff57699dad672241954de86dc51fe4eb865beaaf2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8658b7ee552dae732459a3dbad7b02d80d0003b13aa09d6a4b7776796ff16394"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "927af08e17361e71f6893694a9f065a8f089014931fc3da4a9f785f4b83de87a"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "59e754dcc78980ba6dea49e0c5737a502d2b66ddbfc2a72b7a60d693fd15eeec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8945c1520d615a75328c052bff95a9e1df42c1c9486c9cb272bf8c80429962f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0d0048adb0207c44500f29e25c844779acf95a41e12292232659226c303886b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, 
"9ea2d6f49af9747a328eb0d3a912827b3c9636f3ed7f12f4e09629aa22ab9d91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "11adfc33a92d3a9d1f5a566c8a240cdbcaef7820da873397202f4ff630765e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1faa76d876c4b7b15c28e568f28914d21c86a13682e8610228adf5ea3346641e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0a66d2924211f4577d1548f2b396d1993ca5a5fe1fd8e4e364ccc31220e6b038"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, 
false, false, false, false, "fa9f01b236fdde75a05c64dce25538d5406578f48325e942078bbae16ee09f45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f7e6ff1f7d5ecdbc0d62a97007c5c1d8169b88d855d093cd48a8c4556e8ee0ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "12fd137ca0808760765722d925425ad9cd74e828cd6f272b52a5e6ec0c97c255"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0f2202defceee5568028d261d7faa7309bcbf13527a6a785781da527303d108d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"1d49761891c5f4bce6be33b28aa0c10f23de5e5dace1b97b21baccd3014e0db9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "6cf049e099893fee38ce1b9e9eb50722f060f9c1464fba77de40d709c91e93a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ed2e174028cca706536461eb37c1e5b300435b5080ba6774068e58bd2078b072"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "caa2a035f6ed04eaca8b0f55cab0ec3a680b4aa33bbc83954036fd3a6e4d36ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"3a9dd28177e228b60888abdfdde7413f32face4e8ec2457f677ff4c98eb2a638"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "352b333b73fb60495d3e13156d5a080d707d657bf036cd4b20b50d9fdd7bb380"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "936318ba98f7135795c07b4b0c45fefde69035ad59a03599d057cf1060fcb1b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "302bc34e5cd05f4f086c4f5178ba0097c1a8294193e6718165f8f153dc2ca9d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0f4b2649a01fd669f4d9cd69046e013313cc3fea7ac6cac67d373b9c2f53c882"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2574d2b494abd3f35c226b16eea5419f63060ef310d08b1c33c30f9339bf7402"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7008671c6065695bbae528f451c353f3faccaf8053e0c536857fecf55db59fa0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8b2c57c9f8459923cfdf283231c5c2da9da2b9397ef0acea7dabef306b9e0abc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "27237f30bd090ddd928eff0270ddc43fd0240412f4cc3f6253704522874fd0b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3a31649561aebc6f21e84ee6be9e9a4586980a7a61bf8c42b47c6793956848d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c247c5c4275e04e635ef55d00f4f5c39a64ed6f097bead0a20c2fa868ce3c51d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "570daf81ddcd94bdd68d25adddd9c9daa9d984a404fa8c1c59c68ad416753f88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b260ccdca25064d939caa56af90722f7988317e3fd630960fcb00832a5e4661a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b60aae0759259752cce9fb615d352e7a0f66360ecd383bcbe42e3e1119fd061c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4734c3647fb0d0de4d6f1538af0b446935de5d092d3f1c3960031aca5b28e90a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "350a16f68da892f5b9ff2515fa5833ca311b062992b5cb08a6b0f6fc4b055a11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d3d5cebc9a9e9e5523e7e7d4ed353b5ab7a6d3afdcdcbd682bc71c8993a1e282"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "079e5a7f85d1bff814127c48ba5b68e02845f21b0dc1c95cc56e45edc20f3e2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5ed049e22562971799b96ce5abb8e0d5c82973292b08cd0b3605a2933f403ef1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "48b21e3395d279f120f7f0b7b2f24ae78d6e110cbd913715115e9e37611780a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0845aa36667ba6c0e6e82a8b86321d39ecdf67bad09d9087edbd6b83b09e76b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "899b66147d986323d1f518750cb57e63e35053a214cbc88c5bf620695bbf403f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "444dc520df8827b725557c3029b49245ecf96bf40fa367b96e009d6eca19c0f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"7c41edbe1eca9d90189252debe52b8151fbfd2d70e2d54327692a6dab1e773c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1928b8c1cf55a496f5ee7fa1088479b2c6b956240e2649de26959ae8e548b96a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d1e85e2968f42926f7be2590e9e33bf5197a820775ef35465b7898c755994b17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f4b564b65048baee571c028ec6c63e0bda1b9ba6fa4f7852b3d5ac6d56a90d1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, 
false, "4f165f19690bfdbffdae981e5fd8e1714691f3ff771538fe2ea461eef4bd96c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "95a7dc5f0a5e0ad19a31c33df899be59cdd4ac6204bb7a3fd6c64d4a0d8426f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "08d650f73a2c5944e0db7cdba04d99f9289982c79a06bbc88c9dcdc4bf773d0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0e630ed07b6c6d1a02eca7014c66868e20c937ef096bb74aeaeecb127dd715d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, 
false, false, false, false, false, "f8e9028ce59d5274f5ffe9e9b4d7dd287b627136d488f3c7420d7761458a26fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6f36473bde925ee276368d9cbb28c7ea021691254588a958a9f81a3dfbe2f959"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b1ab1ec2aa1da0453a52e64ce7c186ecbab442f0e98cd3b5f451446a49344f60"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aad49e304633171218d63fe0537ed762f4046c23b1697c4ad2ac6cc07c3188fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, 
false, "e676316197dd13d8c282b17c36bd65807bb7aeaae2f54d2eb05bb2042ef92462"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "106b73c452cbfc363d32175a976ed67209fa42e6d35807e3b5bac9bfc040dc14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b047a83fc5ed649269b6401a54cf15e57ea4db079d8d47c643474746469bf1a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0367fa74d8d1629e25c82067bb9e41803277e51f5507c4cfe49e9c8c31c2032a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4ec410a84b4b4906ca9f7629e8905f6404c7e27c44b9fee2532bf7f5661ecf55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e25a0f7c0877ef5536f931c1610cf6d7072bc96a992d42c04734bff862e681e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "db927063f4363ccda6c29dff77a2e86e7a470ad82d9520575a5c421458f840f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e6e0f186a085fdf920e29303bd6554aadadee43bf1b96d05dc1216545d0f7495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d69054c2f05035fe4bcafdc3be9b92f10de70a21fc6279d696007114635b89ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "e3f253e5fd1cc669f756cc0a2d3bb17dcb625a23e4f83b9a103181ed0835d712"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d47744acb8a440f5c2df1f6252b4275e3a1e0b897ab4a7ba54f75a9a7bee533c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9c20b7bfd4f840dcd4b8c58ab2aff654d5566ba83b52722af5c07fa47f692743"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "27ad96e6e647c17d68ec66d84347166a40f436d6da6af275252dd142df8ba3b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f33c6eba31d12f356a6b7d24da6237ac2694faadc519234e0a2f44598b70b338"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3be4e43931724a739b1c565b05a7b8475aad81866e2eae4210f86522ed61d2ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "08907a0f44e2afb3e0e437e27f00e71ca55dbd0555280d2f1f2c9f3e620f896b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "94f4133ea53d0b591ddd42746845054b51be738ae8f2bd0091357d3ec7f78b33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "36a7fe126247f73684ed55a8b2b172259847bd33a03f6d2a15da12fd5a99703d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "eda1a3eee17e7da8bb406a3a4b5d151135571b008ea876b3f6c89c8ce38bb7c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "20ba73a3ba774ce16384f67efed4cdbe9f82740be87d96afb0b4f482695694f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e1714bbe14e0d91e9ef4f4c50113efb1880af147ba660f92b79d1faaee75076f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a112036990ffc48b44ee9ebb9df4a4ad5c07ca2436107777c21fa0b446603a7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "861d5d39293be9e51e60a8d8fa55cd6d184a6fe600c3acd43c1a2813169ec93c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"c315224dfc7920c25ae14168ae24aee7e9fd399a7d3b601e09b13a1c89e1c15a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2a23695591843a17f99c1ef4e454decb300c8030209c63f1ea8eed4638a8bcdf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "89e2413c2ffd8cc0e60fd395adb89a952bb4533b94032750ed4f0695348f10cf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "42b89fd1dde74fbdec9a88f659944e697a086de64946cf2e3eb07305f82f7e56"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"94b8e8d178f75d4c487e3263997226bc5e274a32aced385aa8a57d667b1d9b3b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4fa19f3cd155dab6b8282414039ec490fa164763884b794ecd49c4ba9977e4d4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "210a978984649b42d085ba9fcd51f6207464d1c53faf2afccab787139f9023d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6c1c9b4a094e42dee4cee3f1e73b564b430aa813a5940823ffb790890d50bca5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ec758644aff66ebcb777d5a5d572bc8866e44f968e4d644dc862cae21de3880a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "948b4a15b03d1b0d9bd79caff82b3d8392cde3c29614e31a92fc8d88d5b9880c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a29392c44076c08511a806dde93cdf5d04bb23347a96be79d9f4abe72d3d8b0a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "eaa105ccb200312834295f5bbb7c0c6b0ebd28b2ce2793461c011f64bd3a2750"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 
1, 0, 0, false, false, false, false, false, "233319dee83c3898346746f88c72b7f294b9fa41c557d2e56f287bbfc4d745d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "90ee4bb2339563b443965f5a93a797a49443e4089b4e7b0673d6361fdf726885"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a29a8dc953f6abd956d44239fea9cfc718dc9d3ca7bc4edfb30143be104386d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "5c05358ea7fbe99f44a142c38799aafefb0b1e3d7e5b91cc383b0e1542a58950"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "11a18d6d8de9c30723a3a33b13c0bafae3efeb6036f6c4bafbe1db2953709e5b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1c00913bf879993539ef72353977b74f9dccae7f7d0df33f205a68cfc9732044"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "46e8a621aac27b4532c7c2267849f9122f3792e3bdcd93063ca1c3679b67d840"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c3a613cdd1430b634c44d5f5a74d3c4e62afc96b8547df1e4c96a91bc2fede8d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4a32719337a4fea2ea5a4946b4715ca3b6871e791bc947a33f9fcd73868096f4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "9d6e91a1c09e84ab904aeb06e2118487fbaf69adfed86a2c1fe502a346e6b70d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "bfd5cb7968aac2ad4fee38240eb1e0a030c3706591b6e4c1e558022b8b5ef0b3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "60d0df213287281da1f482a94e919aa6a7915d83c7ad2b2e8cbff4a51cb4eba2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a3bce13996f7a15d82668959474f19eca4aa86cb1fd069fec298e2bc7da01c32"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "3892ea951b0e58c647bbb01342e04c05d335cd2c44e2f96ea97bb7ab4d939066"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "56a1296780c9f5e7b4fccf94be374154686d09c6ae82592bdb553417933a2f5f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "35dd9ede883de5587b5fb802bc605262f48bde52df1a3af627e1d3700d4cbf90"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "265e6b02adc419b2d524fc7dfa676270126bdcac8ee694acaafccc2d9f7ceb25"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ec87b8b5ffda8da6b320f6d407b1723b9718c80f24d7bb768a25a89fb31e8a6d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7e9dcd095ab3a7bc17439c6bfaff9ccf22692635d06a23c5919466c2bc206fe5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "c73401e33479508830cebeb2e0245c5af42b096085f858613f8ca8de1eeb34fc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5550b05169912c56630c07ad52f5ec19bd50603a858a2105c161e4ccb5ede2b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "81b13e079642aef4808d2c9f1690565844fcdd47acd1ab98eb3fe2f129c58820"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e795686f2abf2518d3c21ae1333272ca9e7cd170bd774628abbd1ee6e84f397b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"34750157f4b9525561951e28ff2b06f03b7a13f71df340d33249cc2b6b6ff95c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b24b5adc271c26b153e69b10970981afc4be986d8d3c20f3112dca33d3a2c42d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ffeb8c4ad21146765e8eab13287ab0b1c82650a35ca6bdb5882a4487c5155e7a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c89592228435e89ad2bebc941c2392c66caa3bd4c01bfd54305ee9e8fd322edd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"3d3357e807a0999f38e87f51014c331f36d1f639dffb9a53ed03f11b0e7a666e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "920258ea2f505176f4279491b1dff7e1d3045100f240b72f73726f017d70a7b5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "7d26030a63204e17b42f5d9ebb42998fcea90fcfd12903795b8fb8c2616794fc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "70b1b3453fc36fb0ac4f79bde15b139d8650878a1075f8f41af023f708bf5a81"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "977242dfc724ed6158b1629c7a0cc7042f5f5896394fced5bf5ce9d419e97275"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bd211e5ce680c65be00ed106ba6db1f1a394faa25d5e27cf6c19f94cf44c1668"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "eb4f77ac7151d4bdfa635db2278ded6992e57d551ea48c452e8a46e9beb53143"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "465294086722f8939488f59b03190e97574da9e87d9f545c9d3d34a23399cd7f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"f49256acef3f5de04dc3692f5c366eea0bf4fcdfac212fcc345121f7bd7bd2c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e89ad49b5a8eff01c5067c9e60c4fbfc83d866aa8c829112eb5224d4c8dd2649"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "c239e9586380ea6f117b417eacfb3132938ba432f4878abc7ff9667f7d223404"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6a9ad79c88667f507e75d2acb2b18c777d439c481324af5d079cdb6961c52604"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, 
"2d87708903d39444b6175b27e0501d4087686fbf3d80b7fa4b2f20648e0eb677"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b6e19c057872f65d3e8970916b30d035ca6b9b1db081d06de6ce3325e871c5d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9eb488b41ef43d2dbf39a1fff7585bdb3db22cd9aeadf4a187cb0526f596a194"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "7ecb8530d470593239eb3c22809578940ee0057b585c0c3296218d9637cbaf17"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "3a28ab9f97c3b755a55c988beb933229c607d30f447786aa31ca44ae5c9140db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "ae3c7d72aa4abb0cd00d9ce11213109165faed46bf11ee24263a229918e340e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "c9391aca869b4a77346d94aefbd49bdd8946a9b99e6e2f7f7602ba5a15801dcc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "5ada733ad05d07af3771d2f17ab8c9758febbd73c90dc320f54222217025b0a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "34ddcf964454b5c36ac396a9bb57192fda5aa317a25598595f6f83fd2d8e397a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "035bc6ccfef7c29432ac2640149ba2f8263d812c68c3dd4a6721309d65064e3c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "46c45759f03b2b5dcc5531f0f84bd0ee5f71804ce2395c2abb9523c58bf73d33"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 
64, 3, 3, 128, 0, 0, true, false, false, false, false, "d4644d09a7db1835d829a017aa893d4054a401c9d747a7fae361dac4b1316a71"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9257890735ff4b0b1aaaa7c32cf01fbb8ea085ff37f7243de8ba9f6b0b99a32a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "65525228d2abd6722e1dd10e8b7c90c3f6c085ac0a07cc4a84be4e2e86c86417"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "4b77e12fef88bf23455d9a1000bc85f1df7f621e9e8c1e42cb1c90fbb3a0413f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "301f0cf7d8848a6481cfe37e9a2b0c64c49663347604b7311497f91907d5fed9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "daeb1caa2322bac0df581eb28ec9f586d30db2e715aa4f727642d7dc5049c4aa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5c8d7cbd5a8eeddbb237b7e2fe8edbf3f58a300f2ccc733cbb86519cf40efe1a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"e4e91ecb2c19bac1c0d30dbd12de926859da6616ee4bd5e73e632e6a3aeaaaf3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "89ebdd6b702bf63f291f5869c9f59dc8367cb38053aa5e28c2fd5df97665117e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1d7576049d08f0ac4c4b33b0361c4a4334fda7ca83bcfced97a6ec275889a949"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "63b59ee22c93c49d49f9b5e96854a8e8aa9e5955d0b06f202f3ebf334d61cf85"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "4f1ab069b40ecc76b0bf6f8fd0913de0a4c186f28af3d9b22ec20522ddeffe83"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4412f82acfe7772d8f6d25c3d65aadcc477cd100105539e75ab85d678b91f1a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b2c2766370cc887912c82000d4209d97f70e46eacef898b5bc41d4c0acc56646"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6a7a87ae040efd362a541c5110021402d6350fd3fee4869b26861be977c8a5ef"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e746e974ba98d93dbf6b9b71e2d17746f59935a1368fd4a670894b3aa300a6e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "0562f2a45ea9280e9f2c5b06d9e9c8df70cf9945d1fbaab73f6c126d1762df70"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "3b58e7113e31f9e9ad83e7b9b8ff42af20a5bf0b5743d9cae4772d38e390186b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "5ee3eed36a65c02aac54991bd2b48b9b63405e01f6437fc1a9efa673416497bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3885a8d69958f95ce455cc56cfb41389694c2b80a9b5b2c7d7900a98496d744c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ebd2205d3f38d8da4a5325a60f80c4d5343262ad6c4ccc4df7b86057a7262036"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"2403040ec31255e3dcd789958de9dc3dd3c7e0eee0daaa8b8d8b35715d948356"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "a9602102b43879ee15cd062e5de86d693933ba711254391d31e5d335cd779e74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "8e16f9e2e10902f37349bcfb31c7ec458eccda91931b1621499af6dd0ec49033"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "aef807d33e01e1ddff0c465ebd4725fa0e7cfd930134553b8d55623a03ac01d9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "48213be4fe207a033dc7fe28db2cde1676b3e4bc5fb77078e5ca97a6ff26c5cd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d958ad255b68584d7ec0d5e552cb827fddc0231759ae30d7807cb7cead8716e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bf10d9bcfffcb15b08df87d860033443dcaf2a18fa3c48c1ceca768dad02eb5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "801dee46cf02790049259cc3cf5c0b08d58e034376d519938ffc3876d8348330"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "233d5d1d7c62548355e1ad48d16eee3fcec541028c97ba930121ed1fd30accb9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "b678dccb2f50757e2eefb729dd24d0607a7b756a0f3f3d75f6d8f93e9118259a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 
512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "547a0571566205510818e0caa67b246c12ccdae27de34f6417665132b283842d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5d3316c5dfcecffa5ca1d0c4ccbef009d9a17ec1951257de31e6abdb3e18b429"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7080a5d47ba36015c56990dd75547c2ea63de97d2248659a9a542aee598db8be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9e56117b14beb3d4925f521d282b31866454f71d135ba24791b4a444ebde08d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "0715b79bc13cda0bb3ab56dcad0c17b6620aa1d585e1abd38880d69ab693edc5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "63ce55de465452c73dffb106f2935f861ba25d5c7d4bf82770d4e27f23aa18f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "627a398d48cb6e5eabcfc767e6edae23d74d5c3fc5f73563d111d41c8213991a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, 
false, "e776bf2dcb68692601c7f96e47c04ce56d1c9514d2f4f8ec6fd461247096d83c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e1bce82a045e64001ffad4d52eea3cabbce9e03555d545c094c492d65d98ce96"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7d75b443ac2450c2af356b98cd12cd26a0f4230bd739d1a875cf213c42a4e6f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "17aecb995152547dca8181024b171d9299011c56ff53971f74f4fb33dd2e5c33"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "371a6267ba5a2379b051a27a386ee52310bb5df608668fb055dbe40977025a54"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7cc4d344013266a7c2dea9efc3c28861c9b4195e69b1e433110bd1500b19cc12"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b5ae4f134a8abf613487d77560dbb96be6a32f208c1f6d6c974c1b4c4abd98c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"e899f22f5ceeaf484370678296940b688037aa7c90dcaa722986ae689479db9f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a0c950cd27ff0c316a32350cb64cbcbc77c66fa63bf8df0759d8cd4141fc6b72"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6ead5dba099b413481f544008e7eb0150fe6daacc2d905fe1a45da9f1b45f29e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "56d82c35a46a1e54a73f337f7aea7d18e95fd59d5402230aae351a0175ba5f3a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "0c179d08733edeab85082f30ecc704f5252ed96df92dfc47cc8fc4e5c0dd8cab"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4e00dfe63336a78288a5f0528a77753f430a2527f590d45eef806be47f23fedc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c3ba861d9f07b3c5563d99a60e6b89860f7cca3c284280e5d5ee4f656f6a9ca4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 2, 2, 8, 0, 1, true, false, 
false, false, false, "f6a3bc57ec6e81e73a7cb5636e4ce60840dd00a3d993c595d06e4381397028a5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b1e399540d235200407775510fc52ae03acaabc7431d40ba14ccee7f0dd02205"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "41a06f2b0af3abe4487eaf1cd4ce03298e449ba03ae84c42fc6a5220a1d905c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "83892fdda43f8086ee2b05ebb0385c664d27fe433394c286da11d84d8c8faab0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "46dae84a8785e057dd2913ffc1b7ac7b28fb3951a94e71449f0d32a7ab677610"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b925fd5916fc3384d6be2d9963249d73e9c24422972f874f1acd3becdf01ce58"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "05204f9692f489b5bfad9bc9e6bcdb152715bf1962b480f794908926915caece"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "e4b5f24ae71b91ec0a40329a84f4e7a5db9ff6cef530841ea39f3360c3b556da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "d1a19d3908b778b2201cd64269bcb7ed50487b97c2f922cd8d5941f192128dfd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "b7de86a96d6f9f27eb37c01de7b73b2135b8f8aa6a229d1439f4864ff7c320ca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "cc3112343816e0f29e66c46a53b5d4c6e09f968e106d98d96107f4bd834ac7d1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7990a1545bc8c12a020eecdc2f2f07a84f39f393bd51153598546fed49d96cb3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "cc122955e5165fb70db55ca15d3016246aa146717083560ef86fdfbd0c5f1d90"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "f3519dafa7e9be755409746dc3016c80ffdc71f1cdd51c44db8293c093b26c32"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "1cff7ef8634a32965a40add793acfda29033867d17295e8c5ef340533adc1def"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "6ce70c75939a46d3b536b7ef930d71d86e64a2a82fcccd34ed8c4343edcf7fb7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "77a03f1848e076eb8ea6ac00ed4b8aff47fd50c94fdcc0cb348588e02607ac27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "15d004b96f379bb39173488a37f526ec9fb2bbe3bd27ea8f9fa4a4eb57ad5852"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "5406430c0a4226e94a087e3611b8d390857fce7d81ee2350c33b3ecd0106cdcb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "e6ca3fcf94815f56aaa28e48e9f17da7564566cc9fc7deb96eb259a7a503cbd6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "fad5a130b27870bf9610e89b9b35042cc916a005541856a51b6d2395b27f08e4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "313e2c3a43063f021d25c590f6ebe5e95d2242cbc2fdf5a77cb45215e98901e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "d7db59dcc98b4610c93e80dcbda8bc0fca13b6bc458eab4b175fde5fd100b70e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "34ba328a46a7da389e3407d880c8d557a35d93cf601581a51f94744ad87af1c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 
32, 0, 2, 8, 0, 1, true, false, false, false, false, "9a967817f9180ad7be0321b0c3a530e7d555645bdb021dd25208fb2f7a8e548c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "57d90db271df8d398a784988ef8a8b2fdfd13795a0b19c3e7700a59619eb4a2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "fd28de7bf823d342cdea7b338e8754a29c0392be895234f74037f54073f4e588"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f45ef03a55fcc7ad0176b3f0cc10adc672f1ad8796592f9ae3b773ee3f891ff3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6cde8385cb79b45b18ee2a0a2f948baba3b028878c3f8e2bfba9f4f1c68100c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e83ec2f63bcd5b5c31e1792279a5126d20a2e72c257491f7a8e9731999a424b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2fd99c94dfe81985974a9580da99b41632e04f173ae700d84824de3b4ed9e960"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "11237bf3272e52dd695efe698119c05189a6fd68798d27c52bcd108e3f46f0cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9711eef71a6952d92f3284558348a98b0ae24d8ae45a8e99492019f5efa26c75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a0c712e0187742929a3788b449c7621519b203bcc2fae689bbb0453d79d30357"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8c22c3c314f61491947a701e6f2b99423957099f5e2b5088ec711dc285abaca4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"ac252ad14db654047610f59b89e348937c65877baa1c8288d062c6f01cf13eb8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2d6b90da60a4c29c78b69d2e4ce47c9bbda4d7553ed65bc00db865d788845cbc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "75671a8b5d71999548e1fa15ae0e3e94e19ac3abf8ef1d6283a8b4c2463f305b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "078183fb5ad20313e576ef6781287c40898586099e847e6d4e1dbb6a31ec8841"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2c5e08e51798eea75644ad6f295a1ecafc9dc0786cc913ed412d838fc5f17b93"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "34d7304fd7d421461d730d53d64d0325a7b199bba4f6c0af591759360eee4d7d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f4dec2db876b45ee0d35a5bef0421ed64130565687670db07dc97a1cf8134d5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5de0daa22350e6f55147bbdd3e78fb71d86d5ba04d990651e20bec14c3f58c09"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "af2db67adb8be3a22a331aac3cb0e37ea6c47cea78630e697ccdb6f098af0e8f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "3abee79392b05a2eec9b9c82f59c813908b098f40eb0a3c999790b84ed75d4f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 64, 2, 
2, 16, 0, 3, true, false, false, false, false, "24933c9d149ed7c4013bd3fee8e4e1d8285a7e35da28143f78121144bbfb8d4c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "71397831a9918bd3e7d07aa452e9f8e8a938e85e8ac9b7d2dce6231a4f3fbcc1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f4c66f5e1f4b553afe1c780cdb1ccaf0f405944c4d7886c5e68270061eaa3bee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a5d9a72f80c517f579d5d02a265ba10f962c8a9d5f377b5c52f1932ac739e990"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9f9771b2dc9d5854c1e196cc81b13c9608d4fc19b941650fd9ef40c335f0aec0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "225acb8cb3c4c87643fe2c66122a888b11cf72d1f556288a03f5d9a8907ad2b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2e71b98346657f01853fe434c38dfbd78eb9ad178f4d452c776a4777f3390c8d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "87f2fcfb8ada77b337d19f66b1365317933146c1c3201572c9c40b2a37ad7f73"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 193624, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "35a44f9ad956b0414dbc232a1ec650e3024534af26300581fad91cc103ff4f39"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205992, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9862c7d4116642d2e9ba56c5ca836799873c38bdad2339a19561a29ab4898ae3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, 
true, false, false, false, false, "87e4e755588fe0b249805bb9cad8de90835634ad71e6d6e1d212f572b2baa977"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179800, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "d04c1b4629a259afc3aa47e4f76feba873bdb2e3667bf6c2b3b8b2db68f2f0db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194216, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2d3b1ccc14220e67007762da8236140f7d3fbbe04199e471ed242705d9804eb3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "0a5c135985e4b1c63d1ead364fdc8869718b89c102b7e8b66e3414ab1595ed47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ffd677b1f4c2524b336e28d7cd04529315a6d80388b5a9a3ecccee7be2ad9dbc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9022f140c7b8c97f2a3e446692b331875385268e3934d779d298d3cf2ad6001b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "85c0756b878ce5fb5692ae6eae59022d6f7f75bae2589bf6d50f86fa84099a50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "60dd7f7fe2bd1320e7c9a31f9e5c93e6893c219b922a11b6aeed5e8f9469e1ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "944448ac6b274a3020f8fd39498371dc9e298bf3409821809f3221e8cbeb3bbe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8e049ea998d8cad0bb52363c08f420b29e564dcb094c744d18ff7870350435fd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "34a39135f85f3d22322edab9d3ff57c7dcf48baf584239ff064366ea583d80c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4d029f68aff51dfbb2f21fad4c044667f190f2129279347f9e6e80177012824e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "0a880117e4d1b3b4ebe118efad99a650b8f22655c8a7cb0d7232f8c2c616bcb4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 
0, 3, 64, 0, 0, true, false, false, false, false, "b8d26eec0f6964f6b8a33428c6cc8c371836369544069091bbaf8b09803d2402"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "043a07166df8feaea498b13d03c3061a209ad25594436c5d386bb1861eca0896"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2349a52b35731c75628f8135975d8823132f7fd238a2c58366224c100fc0168d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "829fb4fe670a071d72f784e8ca54f1cdeb30a336081d2806a78bceedca4b83fa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d3e005a4871d767b32b2c374b398c4c16395b570daa665fc62f7ef4c12191eb4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 193624, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9d4030487603917b8e4f02a95984323a407342926fbab81205af1fd15b0ab67c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205992, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b81b0ed0a77fa9a67ed7a95240d11ef8e5e8d05aead0386f6a0dab07c4666b3c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "034d95580388bc51bbd6198684fe9178b7f74d9e6f1e63d64b382a75e7944c19"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179800, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "48c913ba7cd7c2503bfc3a385ae4326edbecc793c1a31603c8804d6252dcf4fa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194216, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "cee4dc5c3981602c3f66d39db77540efe6663c03e48db48e2812cf668ddf2988"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "b68bb2da7618cb08358a9916e78b2fdccb55a354b13979e792ec441b330d1a37"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "33a9ca460a233bd2a7d123245269e9cd1cd2ce811af48e08c57f94cf1a147e49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "69d8bf2d5ecac8f3a4d1ba5cccc9cd73ab34176ff7e24c40b35bab3ea4eadfc3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 1, true, 
false, false, false, false, "dbe350e80a7af918627ab816c004b7debd6b105c6be0b092d8b1effa586cddc5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4a8e7688e9aca7e0f463e479fca923954d3c87667679721a8fadd949c3269b60"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a2c13e06157bc6c364d79d81ef8d76deeb702a816cc1bb09ec526fe43969560e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8834e231eba684f71cc8346f72ba7fa0a990acc62cbbb400dbad8b93f0630697"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4f8a4c443e3265b0546597e43cb8af561d54fe916514475cb735174d4dd2d185"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5c01cfab3e23910d8f1c1911eebbfc0c4502098be9e2306a7ce91b020ba7ae1c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "5e29d7230b1d5c9d9836f6608b7db4975d6890140b8c0ac8b37363d12789dbb6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "861b31cf81579523e6b967a306d72097e58c28310612703df5f5d3f1586bedf4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b651e0f427b1a501f74d88703b915ced4aa932c4a540303fc1263adb82f536c0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ebc29ab20a3b145c23a2834dcc0c396c75e0985b2a6688408ac3dbee1d0a2eb6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b3f6a56fdf9b32d72baadea6f4730e59f53560c10210ca84a7868ade5888e89e"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ca1eb7b8eaf01523ca36d8729f97848729c5a0f9e79b854919566152dc0f45b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "8505382372d79fc4728e71473dfc5dd6fc06398d8fa6049bb311304845fc9b5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "e104b0d685e937d079cd184decf3474caadba97278bad90d947506afe1bc5623"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "66e14ab96c059793433086ae8243843dc848d3d75f5162fb1b9715f9585695e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "399f488f4b0b5091fee55ef5dbe9ca54f49cd59f446b391a3177c4f80b201229"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "5cb7d06f6cdddd053b0832f408efbeff1d4f9cdfa0f2fc3d2125653e4b98d18d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "c318c13b7415186a270480255da460894c32f924a105944bd8588510977ce640"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "2fd0a28632240b0bd7b5807bb30ac61805d2c17297640ce0f82cef26574b3b84"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "c517691bdc0e1a1bbf735d0818f6d369ed6fa5877ba73513a8441884b440f39a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 
193112, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "df9c05c7eb9c767e421cfd766982fa76ab84c1b1dbeeb0437c37002b9ac04c75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205480, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "2c7a49bb39d71917e86d0be8e58215893b0fd28f1975e494746949c11da878ce"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "e0c0a21c680823550b87e4ff771654e6410b9cd4262408d3cdd584f7a12b6f66"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, 
"2e5f6ff8b717dfe4c4fdedd589b34dc5c7a22f961c0800cfade756fbb89ac686"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179288, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "00b574a6ffe353b0c57df44ce2d1903b2f89c00a4add06b70146aa3965627236"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193704, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "b9216889b095664df20f776a4aeb192127a4a384283824d16905274e9b1673d1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "9ed363776bfda3c342a0a912175050f90d2515b84cf9be29446997770cdb2847"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "5ac988cba4ad943bd1d169202ed5fad323698c2bdbc37e84bac45788049b5871"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "43ddf7989b79c8cfefcb13a0be27ddbd7233e79ab4f955cac36e38762663332e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1a4dd92f15832be9cefa406dd6a9c75b313ce67082f789365c16a8cc5686207b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "5176729110dbacbeb3c9e61d4c3f59404513aaf3209b0df71004e26ec12782f5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "efc294be181edafd9deabf67903ebbd53e787fa4261b8d8e04e1e925cdefe78d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "01bc1cf35b1b11e6796821cfe3173847a95e814515820e8d3985c1eb8efab5e6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "71f4ad208faf4afd9a89fa25ca8f9ecb3c4685e80ff5815bf61fd24e489091ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a489f978c4569fc9f09f60ae056cceae80c804050ea158df1d7f4e33fd0ec948"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "d6d14e9d7e7078556861eac7c92706c539f51c24ebfc67e037d7f8e8bc5d5356"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 
384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "8b09a8d8e0654d62c2468c2950aaaa167dcdc0e5dd39aa3955ca56b126baa707"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "f5bd36747095e28e75051970fd2f853622c0074dd79851d563013e382ad3fcee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "d23bf77763ca8e29bcb38d41329263c04b85e24e5e7e2b94ca6cd8436675ec40"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "933a48064ed2bb83e7e33dbaf1bf0d9ae365a7a50a84d29999bcd1468e41c9a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "5657be922a490437fafd9988858af6444dc76d7ef0b8c453ea474c5ec6723510"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8c285ec588e64bb12af409859a9c6af03171c756ccde7be912fe1cd3d23bb395"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "01c312469d57fc44abf4495631c37f4f0ce4cbe61f03c0f9b33438edd61c7693"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "392208b619d553c11cba81f7fc1e19fd05426ae64477ff8d13ec63f22d942bbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 193112, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ba09bc221b241fa545a18d652ccd3ec99bedbf88c0370e3ce38f886995e3d03e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205480, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7d0e008a625d0008454be0989f8b2c6f95d317f4ee42a2b5e8c9c5b0def9945d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 
3, 64, 0, 3, true, false, true, false, false, "a4d6f69e11801604904ef504631e22a39cec4cffbd8d2b32f4e0479687e6e968"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "f4413fbe18f5a22ef3950e9f3abf67ce15e79e00fa1cf1357583476420fe0462"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179288, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "833f45ca7bb8cc7fd10a00d48f291978da22f19df0fec0f6b101b9e0104e0dd2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193704, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6fbf704795af474db5c21ced277db388f6ae703e0e3edadde0dcb34ddae6f087"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "f06e7bd22bf585c7407c505b6612515780a6ce666573e6c7ef32c9627eba6551"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "6ce64e6a5712ac4c90314dbabd64dba85c0ded0042d3042b37b4167b95ee027a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "d20e24b5aaa2df189267bc75f39ee2fd828196053c5996da970b113f16c4870a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "71ae79cc36befb462a5b345b5f966c4d03272bd55e735f16f2e384b3f6a0f605"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5edb9e3035b96e3983399971370a1d74ed3caf65f81da9cf3e9de3fca7665a1f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a63fa530dcdcb633838584c242f9596923e3e7affe7cb066f4296c58540decf2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9d5ba104b64888d28e6d7f0a43d532404cf8d8aab5e5131be583c697f1bfb33b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7785ae1e34fff64103da967bee77d313a6b6641910e8f2eea85e334f0bab1aa0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9c74ff8d25ea429a04d5fb71586039e84c7ce803335556c6309837bd8f35ec44"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 
2, 16, 0, 0, true, false, false, false, false, "77c26ee20e5127230917dfcffb14ff3d59b28c5d02b0698b5dd40d989bd84ff4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "ef8229ed9af8a795af1ac4c9cd1402973040e44d385dcba1878055c4907a9079"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "c656cff6e93caf33143b7a9ce8bd9d5d3784c8e15aa859f2bb34beb315c2019a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "bd9b6947e89d389e3da5e949aba74b01f43c6bbfee41aa70f3258ebeca62497a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "2c037b6b3d433d6ef473371b6d18a5e8b460530305f648f3878d8e9c36dfe135"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8aa44ba72a3a4906dde6ee1a96d81c2cc2ac5c62222b771b676b67815b8833fb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3767376910afbb0b4d88107cd9b8d34e28cf6725e26ff5f579d9df8c421f7948"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "42c9447c38daa85a94207527ccd477cca928265ed3dba822f4d8c5b121b63dfd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ab9936153f2c12d14b620c982468718e1ebeee624f17145702ea78bdfda7ce2c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "db79e2fb7c41e8aa04954efe441bc2401c288f44931676d3cb0bba70a4efd9b5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, 
"d5918cca59928b44b496f7d747f35122bef1c8d1a9c5664d66cc5b6f77bb9d52"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "258320bb363b5ee08d390e28b42cc1cf4c475509e2530f7ce7243de81b240ae4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "fc1b66512cfe1d378c0271f29cab0b7e2f891f48b4d7aa139ac173913f1621fd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "e7e54e3353ead00a0fc6290597e6a206d291c54168abf7c854179a5f42736ff5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 
256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ebf1e631c5934087924d49f506de18e70768a2bc70ed9630289f75bbaeba489c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "b95b1a2e71661d55bdfd98cbf39ffdfba0925a0a93bb37691ad558d990d87e60"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "e0995e123ffa5f9ee7f542d26d6bcc2220b0cd48b0035ae967e3d147b42ba5cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "191d5db7809954ceb113909514f801ff50ffb243df59861c640325f42902c973"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "dc5a59e702b393a2cb5e44041625f0d1abcf7820714d74f395e913d5f3cd21c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "c0638c5f61a27f7a3032d4cfbbd683c5f46bad2a3487574157748a3288bb0ab0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 
2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "e5e79cd1dd2a346dde04874554cbdcac40f8810f036e47312023d8da0c1e8d9d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192856, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c6b0384e883327b71445f277a4a3e123777416f3ffb79ae18e5d823e043bfe9b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205224, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8d71920f1d628ad57e6d4099644631140b25cbf51dce3f465480905524707b8d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "403faaee10295865a769c4a40a603ea0dfacb79f39a72a3d1407fd1518e7cb2e"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "87fe84b8cc59bc448393579ebcf90fe5c16cddf550dd89f21f416e35d6568bfd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5d87bd9e6f497afd0f4646db8b758e1101db058e024cbbebc4f2f3d6a8556d87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "bf5d2ed915d9971ec4c28f85a98c91893b7e7aadf04bb3edbacf8f5770b9dee7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "408f46c49907a85fee42e1e12b5b6717a04c48dd929bdce41ff011abd509198a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "efe37a15bb20e4b4827ec9cd2015b7b9a308344d5f8c53093da82976ad8ca046"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0a2fce655494c141fa9cfa7bb45046d3843328f61d900cba28a1e715dfebf3fe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4937acd12994946ea5e8a5825c0e43313873f9c6c54e951e181c8436b710e9b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "169af815b2bb89f2a29e0d1d3fc2c81479540b7857fa3008ac91d8ea967ff6e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "44bc30d96ef032583ad750ee4565eaac194f1c6453fc7456004ff9093fddef86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4a49f9506cce0af5d80445ebcbc8f53838f49fa8b918b1df001ffbc68d4ef339"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "31b6fc5fe5cd806e0d58187a2dc7e5d41327e6517431b0c59479dd3920bfd0c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "0517df9ad58c2250350de6f2a1b5702d5079703f139fc1535fdba723577daf8b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "bede31c2b2806e44bf1c73daeb65063e44324ef0afb7c873cf62e9251490df4c"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "575bcecee34fa8ebc045c956ac467cef802f6e039dbb93d3bc4a3176f473568e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6ce31d3d0aa494d3ab35509405a2eed2dd864e6f95ff2f55991b8f45486b766d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "4bf461939a2991117473340808c3f5dcff17efb176e47691da349d38f8bb4a24"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1686a6e418a3c3977240318342112cf595e928c5c58e1415230fa1e8554a8123"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192856, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c2911ee6370f38bad484a264d55b1ae66d5d1cc956671ed387ed5c08c54184d0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 205224, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b98b9074e8bc4c8bc230481b44342837cbc96b7dc450ccd4e0579a52a95a5749"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "6faf48e28d638252396a8708d2fa47649485d209d8e99f50fdac83cef74520e5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 179032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f3c1406c0fac252267937261f4b98d3edd1d149fd98e7295104b2087beace39b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6dd343a7479da2cbfe2dcb90f7389cbac7911c29669441a26d007646519dcd47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 
64, 0, 3, 64, 0, 2, true, false, false, false, false, "85a4b004333203fc9430c2c32c5a5ddc465bb6c7de279ee6ac813a1ab4a30577"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6b34f60a064af3cd55b889f41a02befb68af898fa9d9661128b80fcd1c662b55"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "af71ae13b5fbbdd429aee3538513642bbea109f60435e83ab454397cec665fa1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "41dce0a036d61b3446c1e4a9ad912dbb75b55f10fc8199212a038a303f6b730e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 
64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "aafa8fc4dee6021386fdc40095eb940f5332912f711b58a7631b233cd6833b73"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6c2d520a8e50ff54b82fc3815a9db06df617a8e751f727fcac78ea8e5c415412"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 159824, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "86f060208a8b084c94cd24569991dd1a367855c01eab101f212b25dd69b337e4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c21e0eb18385d4db0e88ecd367a3054ff15dd2e84ba97bdb33384d4f7c952a21"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 172192, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0ff2d4d51d8656f6c986e0a195f1f329924ff59c5d1ab75c8b2f450849450198"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "76f5404352b850805749effe0cb75fd447f4897bd0f42eca10403df047769fe5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 
0, 3, 64, 0, 0, true, false, false, false, false, "a80d74b6d5d94094ca10db5f4a44f2a8c88b1f61b12ea682f049403270dc7c32"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "449db820cc72468dd68cb66a0d4cb1ed1449e1758f1d2e612b41cc622e683975"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 146000, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ff406676d168825788b7547e3c0e03d3ce1e64f5d5a17fb1d503b1418985ca0b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "7c123447b08ff357a72f617b2e7a274b4552f0877b12d5b9f679dbe691d89924"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 160416, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4d931c0171648312d9a40f071009305ae20b3c86be34c28b0cb922ee5f320777"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "ddfa799b0862989d99ab1f003531c583cf76337566052708a2801ea5f700c2e4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "4e35bd906dccf9768693f799b64db43d72c5bab92be5457f2cc3ad3a321e89c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "64d028a7c36b7aeba32b5920944d98b36d2e65495ddc3cc0a238171e0b940180"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "92c0a63f353ef7de7e7af89af1f52aff86eb35f73822cea2c5a3ae5a7a04af34"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "f821c1b2d7c87d84619fbbfc2ca4c9ed1dfe0f58a19f53d426d9fb9c32b72744"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "df00b3a5234f88963031a75d433151d7ef3ca8efffb46dece3e23bf531cfad27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "830783bf4c9ecfe901d7779358683efd8aad701c959308a318e094eae03e9f1c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "3045c5129ae5a6e4d57148e26d6ec80b04b203b7fc8623347f5179010e88eed8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "bdf40d0d3c3848882d8528f39ebaa84875ba5c41d78d0993ac9d72e417f30c90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "8c2ac42614fde3ac968d5467d819680c5b6b4989b5b72189b83052e4597205b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "cf9296db96c877aba714272861412ffd2c2930746fd669819f089595da576bcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b9a44139ebbfc8b2de2956fe2beebbeeeadbb11188495b86cb3c4bc97a22f594"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "3a00e2f6540f4c508d2fff0ce05cf25da6ef0e66124ffd5c6232a185406c45d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "098bf66d1ea12eb17772fe7f9fc802657231c9e36505dc6f13646f7fc462d652"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "addff1c835b688c0f7ede2f31af5a883d69973c0bb369bff42191dfdfcdab3a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "338cf1ead3a25458a5ae9e06135953a1464a51f320f0477b97714b3846434ba6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b83773f1522a204719eab0076557a438c7ab1530555a2a7a0ebacd3db7fcef55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "042b65fb3bf5618397ce160fdd45f89cce796f4747932c732198f8e5e756c130"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "45f868fb17eec8984404c2c8b500cb4081d30dcfd22fc3c81d9516e55570cdce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bc490ac9d4c7f74ec214a78140669e427923b28d272ea9ea1b65755577b98218"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f12addf254182900ea104b912eb13a4da0abd1b398b6d6a46e1657b3e12dbf61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "43a6188c3c1f014e7668c3f299712b7040c7ececff23cd523e873295b323f23f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3c7347a74bb972dfd1edfac6d3664885344bc0d34e114362c511c81a2e3b6c37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, 
"a7411d66fd78b3d93c287b02733397a1640b4d635626082b7ef40f0d73d076df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "149cb4919807555a77a0296eb17d9656a4b8c88acd1bbaaacd76c1753b37f301"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "4b77810b0db44086606e530b73e2ec4378442f1f8d15d7db0f69297fc5b20e36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "55ad12406613f14b21e6db77c6b10a2acf39cdc69e6d79b8564d8c5d3bfd6dff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d2c6c95172d8fa2fce4d2c6729fe1d127df34d4019bd93c7763fee62e0a94ef0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5a5ae82a46195d28d1163f053aa5795e5f71fba126ad6bf624b1156f78583ba6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6acea23a42d0d317d8086c351ee4352d807dd5a0d7e932c38667aaba9fa858f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "2e3c0cfb846abc28b0254d28d04687d436472269c060105e4d48fe3baf145c7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "722f7eb41b0839daac687b1124d95a77a0fbbcde299b7e2585605cd27f5e7a26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "68889b4fb56545a0faf4392b87dcfcc8aabd2bc601f2c547028355a4c2445b15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e3ed1a734695cf24526236dd61aaf5b55bb0a427bdffc1118942d1e22afceff4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2e5fffb4d384b685f1789686213569ed79729881564425b3ae96c1a22f3123a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "bcd06ac54e72561cd5981d9ee749990003f70cd6998ce110e032e70c7491be0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dafc542d42d15b6a9d478ee465d04542c59893a01e7aefb26b6019ec193571ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7607302f0d34f1369073f9969bec37d9d31697bcf730ab00b405d32c265f98a9"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "16e6fdad817653344b5b3316cad139f7b7530fa5f161007b7b89b6d5335820e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "84551dc422cd0d5d0c0c178c12f2390304b11808c5f582333198c1bc7c28a8d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a252453afb458f4b1423caf9baaf835917d807897a79f35b05f5403496836811"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "41911f1ca8b59ce49c51862d70517e104453d973d53488c93be6c3f25191b6cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "d430acbbefeb886ea7a6535bd346e1b3afaca5e248aa647bb2c8f755534e870c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a5a981c01913fad24fa012dac6cf34d342fc9e35b8b3deea955f2652637c175b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1170e6a336e59129f23fd41ac8b1461268637f39374643cdf6b7327f14e8719b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "46bdcf9263888eaa8ba2b2ca7e7157688495acb993abeb6aab29c8acec11dd1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "acbf0b1a749b8d7852b894d1aadb5bbf92ff7f582b59b010c8e4d849db55088d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "8421724b73416f78cafa9c03a80c4e00d6e0113041fb64a62843b7033d3d78b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "0ad84d801302c47d188a45c5ab6b799c4253db9567ed10b8ea56dc94c52dea44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7681865590886f262f6ea0bbb332a4873f7b826c203ca90bb03a5af278470d35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "51ea4f5f1e20f1f23403cbe654a87c480603bf01f94bd41121bdb840141ff071"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 
512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4d1205f8b6d6490e93858881813b1e892a9e1d075fc027e15d029962757c33eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "acb569691c40cafc71e64411b658be829a3ca003b57ed950659daadc2f8d5a8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a3e89dabf213d8d05411655fb3d5d8d5ca911995bad3f9ca7256065f278f614c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fc41bab25302c064c09d6576d1d8b2452376c8d4cad2c0d4c43d6e658cb02355"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3c27cbad945294ae7c61abde73df75b7959adfad199679c1715cdfe5bb87f5a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f2b1be254979d852cd393410e3995ab28eef7db342866d9d52998fcc24613aa7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2bfc7d96187823e8cebbab8cf138dab334e6877dc06749d729386c020d5a4e3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "c651335fedbb8c0268e3dafba3af8d7576bbc27d20111b95aee478c846288c44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b6695823c0a499d93eedba4d2911c916e5d8f9adf547c4932bc8d1382ea2837a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "33aec117529b6ab479194bee008bf34a67cf7b83251d0442a93ce0436563b6d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "9dc931339f3d1852acc76f337629a995fb601552ffbfa6f9bdc5f54f08b4afe7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"d678f175e0e909b6c52a51ff7f4b5241085503ad20df9e6c6068d84c3e492b32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "caa0a07ef446e23f154df3b713cf1e6ef9f5db3fdc52a8c3d932627fcde03a1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6c42b1f1857b9a20da1fd0dd0da2d5ea961d4511d99208b565bc016622ccae99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f5bb56216c658e9bfbb37852789e0baf431fdf4c5f961f1de7727c034f9df8f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6640886b672347bf8dd4f9310726c0e0afd7d5575e752fd205ab93743f6b61eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "74b95a4f6d0a38ea0f77b6fac0b069c59da58b7d2b5c7b8927d7a9d6b9b804e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "0082a668f4b370f4bf38fac6ffb2c6732dad7b24a2b3d39ceb4181f3e0ad12ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 
0, 0, true, false, false, false, false, "bbd2108615a37c4ce3758199d34155132fe207388857b7bb26e8ecf94250bd92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "20326e8499e448fac0f780d68fc51409de2964acbc57a7495a9cf92e183becbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "50fefa87ff004dd09fd5015ba8c95a2941b000839aaa701663eba3a50f9962d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a2cafda22c9e63fa859d4e7f6f16e76889b9a3d8cc50a9029f5b7fa8b0b5a5ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a6715ac930c0537fd86b4f06564dd84e6c34f75ec9cfc9660ef74f69e788f957"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a934e9e0bc070bff1bbc52e3f6f5cd31333d4d904aaa0762c1a3777d7160d06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "1267c49e462ec0687fe37ad0fe20f07b9bd00594ac260091441d1afeb43987c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "52df788ed0fb93e9330edfb91edcf580d8bdc4c90768e0ca3ad138ff198bfff4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "b9fc4927bc06e5c48d9ee9edc563c7c7c9f5fd420a835205e61d50e27b74cd3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "26b2dfe09d0bef2ec3678295811165a09c967435917c0a1fcd9703dc350b00cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"d70e54d0ac92738fcaf7b4fa7760fee6320530143185b145e090e53be5540e13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "1d2b6d98a8f6ad70845c570aeb081152f115f66f9dfe8280d83afbf6735416d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "30fc77575147dce034925e6771113fed9732c12b6c91a8a37f3ddf0e3ddc86f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "21937fa805089cce2d49ad0def6f945df5081d180cb71b9a240af5402881bb83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "6f84012d1e2ef5184a42a83cd325d08991a2b357590079fb3ec116e9ca1bf68c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "4438bee9460bdb2a1db855cf2c276f51c543748cdc6c1ad3c288a72c7436e344"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "115ee5d04eee67a483e2e3d7d7febaa594b03e8abf444f7eb1f24f0992d16568"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "46f16364f5e58269bccddc7459389f40aa82f97c468f82ae088e0a24ec114b41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "71ae3e7b91a63a712fdeb9107abd562d74bc6121ecd4f2845ac655a64964e634"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "16e83a9a1a9a9c9181072b359671b2e64159d7a916de2b38269ae813b8bad90d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "d9c7f46e893d7a5eab210e841f8c4308eb8e9c95888ec9abeb6dfc5189f65907"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"73efaa2012dd01f70e18875d7275c8ef2cffc8938501b73e2d777cefb3841365"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7bc27358402cfbc7bf29b54a2c72b549ae30c52cdb0385812b5520bf29a6a594"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b451b3da38cd1215d9d6398b22f442a3961bde3bc1f4bf18fd86721e3c38cb38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2450f4055536edb970974e05d9a8afc7712e2be56db1039f491c45d05933279b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "70dc4d651d1bfc89f62ab441f6068dc18ecc39b43fb58e686926e9f79c53de14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "5560358c8bf9a0c1679fc48ca8f404974625b7ccf649b2a8e3d9207fcae259d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b6e55edf56bcc1c63e9206df26b07875c99330fe06fac9a978bd8a88d2fb8f44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b2737e836a90c28a146d1d6408a99c062e2059399d28d55a4e423cb304d1994e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "625df18485fb9e95c9a89d58995ab87d120eb3ef6589824f402ff263755fb182"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "75589e6013ddd9d3884c39bead244f348ef77c5352c575c494e0b247d6374409"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "137b663b51d80cc2ee03ea283bf7219cb7b1473902be4f20ee8432f638216f14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6441bb4f5a993144ce394e845048be838eeb7f505177d0f3814bc1d83293e97f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c17ab1edc6fe3039ae393e1aee98110c1c127f29d27d7885a6c860438ed0d661"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4849807101e17096f1cce51a5ad07f80111f66116f0bdc6c6f296ddf88b34fc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "06437011d2caf3882dca3f8b651b07786e39c3d329fdffadc2270ab2456a1b25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 2, 
2, 8, 0, 3, true, false, false, false, false, "c7162c68ba0dfee1dc2f6b307bed5a844aed6b64326010bf11356f3c70a62a2f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "051734d3332bd63e34fac554aa6dfb9b5a4dc54ad807ea0dbdb3570342a0cad3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "74f477ebe0a2eb4ac7bc91de6aa3b47d57342b6e0a7903b3496f015e55dc4a68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "ce6b03703a8a472eecd06b1e8ff3c28683c9ba67546495efae5551cdc5dcc041"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "69e02128a1fea766a2f895499e83aca90e157cb53a786820a66a1a9df34314b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b63c0a55a29e474bca84012c21dceccd24252e6cbfe248e41a927226985e1205"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "60485170d6d91087e1ecebc75a5c6a6095221d40db4051fd80814cc38a13652a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c3a86e1a12a8e0e930e15154e1ddfc6be7935d64172702847f1389ff1eae7a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "901e8cb43a3d105c273a3037f04be1cb1aa7f8faad10a41c805812f9a23a53b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c71d0dd8fbd1603ed55c90bb19e458758e4a8e44b3e4ba6bd6fb65755ba67d32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"fbde65f9f8241dfeba18eca659cd6b172f54d504240b69d7473580e1ff48c444"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "77bbba88e24e1481951941c1288e8056f3389c004ced10c197311051baa3f4bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7f7521923a0f8e534c045f73122da6f1eddc1da8d2033a4e197fe19c53e46707"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "59bad4b9c0aecb43794e5d11984d85ccda758edd4c1253be8b64395142825244"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b83523d2d71bf2afdee7f21488681995e447a9e6d2b0b148fe73ffa9d68f6762"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197800, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "de98159cb6c13d9327bc2c6fbb293ddcfa3bb8a81563eac5549789f414619640"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197944, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3a77bc4502bbcb2e9b27c263e6fa9b9f537a827130e67fc231729522c807a05d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "a50d32a642b4bd4cdd0416ed709017de8374a7d3874925c2961d77656ea05849"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190120, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ac4c8b24b4b218237a25b5fac3b0127907d64281540ec75d928cfc79f074a6bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 190264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "95218df3910491fe763d9f4288dd59f7de50679f1e3da143383b55772ead01d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, 
false, false, "89a4a412a40c4af6d88a298c07090086b7626237233698abc95c3daf8fb00793"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ca123b4d93c1c349e6e6d5c8cf30abaa3a3871650874eb5847711a859b3d6346"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "45f3b8189a10010917f207014024bc5906f17d2324f6bb56fd9d6df92d149593"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7fdac76b17eeab721a87ff785d707367a0cc4ccc281a3cd1eabbae133422a62c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "70e8bee779d7367348c8af5b11b34597fff4da1937261d089d1973baab32dad2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4c461e9499628a32064752e622940f6c16094ceb0c2ef9db6b3498b8c427d5dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3784fb9615d74152f4a875574e144ca517a698509c3ce343f20bb0ca92536ff6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c4818bf350e064bb38725a46ffac514065f6dce9a043f1ff2f072984e2989934"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "9007309585d1a3e389e80e5748774c69f6cdde3de715b9bf816a2390704deb99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "11feac1aa376cc17dfb39a3d544b22dcf8f555be8a233d2a29bef1c56eea777c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "0d2ea12fa1a31727b558ec7b1c7550795d33f83576a09e866f398da055f09045"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9f028d4d8358b6ffe79c46752d401ecb664a854a4957df6c50caff4d54ec0be9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "090c268f363990e859e1a3a3ef95d5e7816eb3f5c9555fae4d89fadfc93a7610"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a5530d19e59b24ab3f42260487eab5fa4169a996ee55238daf89cf7cd18772ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "9f8b7255b09e99599ab8472fdb231e5dc48c42a1034f6fa674504f93bc889b95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197800, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dbbbfcc721ee0a2311fe68b2475b402da45184abd120dd92a710240d7a164100"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197944, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6772a10d8f87227d9c2da5df40795f0f35b7303940ec06d89ae32185a3fc0277"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, 
"924bb8e99f614607ea5d15bd2036aedc717c6729200895abe49971227efe5193"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190120, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fef3003c50a62426d4c16a0a4d5473ec257ed9a2ec277eaa6ee06003f43a12d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 190264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2fbca83e0ca96b4450c234ba84160b78fd36c6d914c75391db4fead7d83ab107"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "aba20b21e2732ee6149b8ef6fba86f5628699acd0360996dfc4f16c598af009d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6c24b7539d87cc6bf51767946f9509f30686abc57dba09abf4a4d08de2f98a8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "ce868cbd9b5bdf9e4c4c39a72cd77f0f41996597faef5523107b5563ea5f35d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "07a0a97658d517a55aafed1679124595b151d729245180e24853ada31255146f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "324b7cc6959d8f59c1d368ae4f6b9bb64c4dcda2a088d2f64d3990cd0e6f4141"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e0787e3a2a2e3e43b8f7c3aa2002806701e0a8a8e6d8d308cc3f2a50c08ee244"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5e0614e7b590a40037f044cdcbd001ea6dd2a3a33c90e60f72797a4f5e276605"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, 
false, false, false, false, "c1d6258c40da093374206f605c4e7d9dbaa3ba47b8d8a173e1b4c1a606a679e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e81b062c5e9c3de1ad9090f3eb69fd74ac38ca3a917c6b74715a4a53896b665e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "d68567b6882d8d49d56077dcf7d4c54c34ae121bebb35177997b0d031e510de3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "ffdfd30db644c4d90f46ecaf7c5efd5468960cddaa2f7b658d286e8a0c65c6f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "bac7f9b0b0a3aa6f001edc5f0306a72b4ce52eb61ebabba7a9b709e1fbcdea06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "776a2c6b283d1bc96312cbcfc59dd53792548f58c0aae0bd84c6331a54d5bfba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "7dc632d747d8270164e372652191e0cd68ad27f9343df89c591e99398aa4f5f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "559b9135aa41998947bd517e7f871c1e437a2cab7c385aba56f67020a081f8b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "c0137e0af367a785c075fc7908d95271b5b54b17fa38c6b0b2c035af2fd67fd2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "0a8abc8dd96a2e84ba55bd575e719ea5e5d2f7b73b0a12cc5607fe6619621661"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, 
"5d9e3601487d5f98b19010ee2d94e6ede058868c56fbbf2555a8528d964fedf5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "69fd5eaf0069dc716d110c624bf674ce89e001b4cb27b13369a85c70ecbc6cff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "1deab8fbb43777f86f1845b49ed365057d07fb150572aadb6fafe2ec081b24de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "d7fe9ded196b724d749aad0caf79effa563e62cca0d3ff04c850c960bbefa361"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "c69beb39a463c0b09e7a6b00055c168089f1c41a2715eee3335bb3ba65800efe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "cf02b0155ae56365c9ed8412e7a53daafcd73a2db84546c94dddec67a194fc0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "781e0038c562b2dae5b6a473f83899d3e10a019b958616da973a24ba0802bfbb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "e54ef4def90c64df1ea146dd0e3bc216ac1be23238d14761e1e8037543247be8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "d845c4dafc46aa2a3f04617fb73afebecaa21ac1889a1bd7844eeba91cbb5362"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "d5cdbf19a074b2b408bf163ac545789945b1baf07ba52ae65461df4d85dbd374"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "b32ff9cc34faad5bda9a13f61690a64a3dc1511b3dba0255ee63e81428ca2758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 189752, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4a4389d5e774b5830144b820a5c6f6e1596064a3077299240101e37e00deba8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "0f8fae4ed53e2845c8078d4b1c1a80c28bf0aa5e13501299c6806781387539ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "2f8ce1877807ca962f2ebb17e87ea07b2b153bba992b23f21199e1d59ab94034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "455a9661f317abc3a8483f90eb3ae622bd2da1cd87b17ff7916448c3882bc6a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1423f794d5e940a57d1b6ce9ac135575cd991864d519ba025620abef08902bc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "536e5bd996873a3cc58057640cc222c3c87202daa6e22f88157acc9a12c4c4f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "763fb93e882203c85a554a1cbf2a7063c5309a7302e70335b27540f6271f37e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "5461dcbd24a81e4eecb841ce6fe9a9ce8c38800f1ad7b0f8ecc093f2896c98d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "168a110f2239fd11d6b386132c107531ef27d1bb76b59c62d3d195361e41b4d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, 
false, false, false, false, "781bbe196a7246f6cb177ed26cf9dde5ed3d74b35ca83e11ba00b9e43a158232"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "78713bd359d17dbb6551125c7bf12ac2783854f0907f3c3e8699f8cfb7db9ce9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "ede8fd47c91170b66f75f4590f4c86ec601d7ec99b5fae199b6b76dd414b5e2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "433354de9c7982dc129081fa53f1c828df40f2199e7dabdd7f07fef913293f7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "a0722fc7d3d706f890f8d305ea0499161c042e94879de41e083d79555150b750"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "c9a213c60e004b36486c0d265788e77d9593174246124bd08eda9445767bec5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "487f653fb029e6861687667604825df816e7e965a3e166bc84441ef19889c04d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "745cfcc6ef6f02b22098ea6c2cf5d19fe1e617bdeeaf7372d20d6ee88d33eb27"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "30472bc04877473439cb2b956731d92e252393f22a17c3db43798cdca7fa69a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7b289b24fe76114e443056be374081953652fb82788326a6d469c6a0db5efe65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8277692a321f1e66f00b17ef05e85d656490cc27b7650ec4afa329d67886cf83"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "86063446eccc0a3bab5c29a00000f259be81859d5d599c9797f7799ed3a762fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "ffdc96f14548beec32dbc8f954f501bcd516b581509aa7b36df8f3c04df36349"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "fe54e562fe6a769cad9e22c344b8b64475d233ecbb4894ed7d76e35328c17fe5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "00e416e9ab313491a1359e6e507351ab7d61f149d3ae15a6577bc270643afed8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 189752, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6d86ef06063f4da9a61eccb77e0e59418e726213f75b9b92a54b737acac5d3d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "5d394eb7b0e1df0e4b703616f463f8a32263eba831431092737e30b8b197eb47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "abaf73e9c19ded947366966ec6746638d1848513061101218e16cc20bd0c41e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "603745e9538b0208154e736c4d064b3bd8469ef401f38b159cbec9e4d9037924"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "ba5f789f61168e79dfd25f42c5d65907aff4c31cb26f8e412956b7c0d86078b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d24814515c91e8ecf394dd63fbfbc1c38bc8fec83636b6b8d3db71723c5db645"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f173d4dd53b53d8ccc504bb2cca629e0a2208d4c10ddec86d24ed587b05db275"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "7e5568bc8ab67a2cc0e93c0760eba60eea0db1d9942e48a38f54d73e280e2d61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ef4c969e0cf82e5ce187e4ab33d1991a46e69d57066a2930103ed805cbf90def"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6724cd34aa3c4c5a73fca5a1f9360aea65d08716ca4a7e5bb79ed29a190ea1ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "25ee7887c5e65fdbf4df12558bbad29d4f26b0be3ec65cb450120474e6023133"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "5e49a854351343e37e1a8ab376d9c90c8af06540f27bdd1047f48b060c7076eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "2a05ee0c552f482a43d9db3282f9343db42d813d03f06594296d882f14cb7b44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "a3cc2058b242731d2f9813b2450c2fe2a27a7de846d9335c0d0aa10384dc10a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "5ef8186a17048c8d57ce27a94012e6cec759c63a0239c9be3ee943d1fcf4db92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, 
false, false, false, "f363c0b612c80bb3dce9c3bba0dce3c905a5c005c56b4f6da3d40f5c1fa68f0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f5449f388a56d945ade0c08d0e14acbedfc9319b7130b69d6fe663a2071a3d6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "203b16de8eb0332aa15f0b260d77e12a11e0c7e6b5ed3388efa47738a332b31d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "72ca14194f99a89cb957aeab35c9860732dc6d154c086c2d59678f18f3d2a824"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "391dba8b19cc995a1788483a359ea9fd08cd2dcd97057eca61725dd794994060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "0d3f54046afbb9cade76ef171393147323977018f62fe7ff77c14a1939809b7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "ba16a272805f1a56fcd165ea8944ca0fe4bd83136dc5ee6cd15dbd925fddbf89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "35024f3c6d6d8f569bba5b7596a77f8e3f238b3e59954090ad95502faac249a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "0243905119fd12b9c74979a30a65876bc3aa385e1f8a1f99d68bbec272ba609e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "f07cb71c29ef80b6c907efc5b25b2fee8499b6a814652b2ec9c1a5a4ca1d1dfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, 
"97cd444f587017e5f4e553146ba3ea7b9695315aabf274de603fa0a498063f3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "370a54e0354b5eb8330bb14471bee610d46300b45a725f4ee7a65f04a7d127fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "e58b5bd4ee0e45c0ce7afbe189b6fde60a8bebdb8f4acd9f193cb5f8b286587f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "b699d374351ba6ef0758c0b32e460e253366628c4e365de543ecffdefeb89b98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "fbe11f7e0b3cc06daa44e8302a5129ac8268ef0a8051c2edf05223c161011604"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "a364e2f203aed6da53ffcf428c62b76cb5c77729312631643905dc88942e8b7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197032, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b21035c4c504ca3f90ca1ac5196b1987b3568e21f25be69ab5669f83c8352f79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197176, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "404c8e5e00747238ef70ba408bae9685317381a4da78852744aae906019308ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "b013bb6c7975b64fdc6037d96fe6d6cb3c99e6ad4c0603e0391c2a7ae3d16fea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189352, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "117b63020f48679a8c1129f1cbe0b101f9de84d59154cc8ab6cd4524f8aae690"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 189496, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"c29afe3f0a7f748c0d0b2e2e08a6d085c7643f0a50286c489dd8631b60fb0af1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "f5c657a3d53e5ea5e2785751a7b1cbf5b129d1b05d410c38ad426417c2f74e36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "55995ec812190b6e697d7c7e947e1b3eed379f210f82cea9eba8ade8c71af34d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "4281b70194674f9e66ebe57c8b134f62faf033b1753ec25453c5d8e5c3eef3c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0931692c149fdbd77b86a56ac9b0518c1b2b3cf74b55c1c9074be5887e38fa71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6079b650acdb1b06b337084ea1f76f17d94949f80b774850f783e7d2cfde36ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d40d7450cbbaddaa34571e8e33df8b285eede26546d27d6ea865e506e38e0fb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3340c8c44c94e41d48d21fa01f42604f2eb5ef98a2efc4e2dff62e795721184c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9b2be9f5711ab3afd4ce0858703bb8e8c0929e1e3f2f9c1abb354355decc8d45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "fd11f5f769e1146d441a3dc46b78f4398dd6599f029f03d6817b080487355b1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "6a6c7461298afa9e5d4fcee441ed673ff72980bff615f65b8770bd4a2c096ec2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "4be275656fdbb88c85934d4bd91fc742311bc2c4872878817653dd057542beb5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "85dcb66743d45bb850458804e60a99451f24fcfbce8cea99506505a57d9a9cd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "e5054f6a57ddb8cfc6a4f95ad30cab1f6ce067965240c2cfb645a89c73619493"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "813774e07d5421a11b0e393f504632a57fc1c08251d377ac6ef9789231b61507"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "0e2a47e527022b0f76e40f5b4abdceaf31de1e69cd045212ef743713de68ff14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197032, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "25b86fc05a4e55c3ac4e6b5c7e3f9a5fa2cd8d3cc407fcae273f3e33208aae57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 197176, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"64a3e72e7b130c0204302611549c852d387b0b6492d6c1338e605aae771da76b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "98dc679e81bc4c9f30619b8539ae0032b621a39d9f7be56034734eb09873daa3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189352, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2138be093a1f28852f298ed2e90c4abe5a0fae781368d24a2f977f4599b111ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 189496, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "19f82fba9820043f74d2a52c06babc51173b8eaf44013eea30e304d04f165cbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "4d405f503bbda062cdcc503c9dcc7c5960f9cafb4ebd313d24299e8cb3b79c32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5f94e473430d6e0c488c21b75cbe0ae3064c13739983ed98fbf3cab4ecf02a2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "d10e51d0fc7125790dfed34d66a7b2dd19b0f2e4ad82d75bcad571c6d53b40de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a03d803fbffb00d2e8f23422b425ba8ff770821c93b152cf8041ce0188c6232d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "fe3403ea56fc13aeeaee5d92ef6ffb3c3d2fd01c18bcef6d96974117fbe84cb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "28d4e8ccfd90bbe3f66afe974f210ff332b2e3d1bd206fb0563d45fa8863e54d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 164000, 512, 2, 64, 0, 2, 
16, 0, 0, true, false, false, false, false, "47bf04d4c5f6092dd08d23ef325295ff4f2829cc3c3f1c12cfcb0cfb8186103f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "fe49d82238b2ddefc816465821c405824163b3db2cd66a466c9e70f3d0c228fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 164144, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2a9ded41a2377757025996b9047e24616080ee783fdb72fc0a7cc6b093e5455e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "157848e8491588b40a4f1c5f73e4a4718597e9dbc57445fc14be43c958549921"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "2b5d32ddd8f1a36f044589906156751df2e80f2284dcdd21cf96c41c3d3c79a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e908da282db9339476e8af5107e37a12816065715e4decf430e386f8a4ac1bbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156320, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "95bea757076e4f57dd5e9cce89cd172e1c17653e5c7cd828da389a9ff9adc6b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "0776341eb0fb4776c5f0e658bad8f028782cf7593cf5170f90942e11b501651c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 156464, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "80b56e414a7a5457a985d430d4aebf71ea58694aec6ec5d9f236286140955ea4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "86dc9d6e5731d3561f90c0aaa8599e95877c8e648ae61f6077e13269e78e8f54"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, 
"03e644496b29e961e0648d19ac06ee41d8786c4012a733d268a27b90a2e4d4de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "a6f0a1b71fa2b1310245f50a34dc8db0513eb643fc3859c8504f01e3b6dcd77e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "f5083301df38527fab93837be4d9b50a38e7241ed6314acc5317fca2bb864207"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e5b6d2a4bf5438ff70a50d679d524dbcb5ca980ecf84ee15e75df5464aa2b5b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "0f771a8b6c329ec615960ad9becfd0097692dcd9fb2269960ce78155ca9af16c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "00ee265bff1ebfb70608ef4794a2dfb89ec92edf0dc5f440b6747b8f8d1131ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "390812f2629811942fb507d9a9542d8e64f08a800a3e84433b04778be70ee8eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "e1ce70fdcb28a12d838bdbb20790072d6bd027a2ca9c5932e680f0d9b9fa0ff9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "8bf09dee03fade9b7c863845281548230c8e86eed9fbae09e6fc831f64a3a91d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3c22374a6e3eb6dfb594fc87e10ffa5c3052b012d7eb399eaa0726d0993660a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "686a8b55f7390c09dfa8e8d967cbf3e124ae744cf9f9e6e0538792026f28b67b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "61c41cde5719890f4f67f6f42eb636aec734a44df7fe9ee015419255e767b2c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c34598d3a5cfa163d3c18f82112ef1c3bd30ebf235d271e764c3f5433317c397"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "589d6f7127fa275883268bb2444d0e88e255ea4c93ed075daf01a04f006161fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "58f21685c554fcd32445269fca41b483f9b9a2d8d6df18ddce9a0eda58611b81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b56f6b51106345984a3b27baa3dc3d73cf7e7a8781469c7b4fd5bb79ee334918"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "732e19de2ac7ec17de854a1e90576eefd6a08df555860ed8ffd2862e537eec0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "61addc6c4a68b748dce5bef63913bbd5770e87752c0fc50839456365e1e532c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, 
"c070b9faf40fdad577925075e58c1d12053df5da43cdc7da1b5b1f552fb702da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "cd31d716941b073cb92a8161821b0c4b1a675541d71164174a980c554c05ee42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "dfa15e024d58ba5ef8c459fd81197c651c5c97bbeb38ff3bda3dc0130743d486"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "95e63b35ee0a99d3ea6c5bbb5426ad803c3b39dee9e897282b8666a3effe6ef1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, 
false, false, "27a9907040e3efabca29f30bce2ff1eb01449dc7a139fc10dc5c3c7650f46ac5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "8ac68547cd431e6aa660b0d325e89b2c032bb10b1f2c211c2817c88d8c80b5fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "aea4a455b676d6ff4c817b72e7687fe80bfe1ce5824e190dd972a631313559df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2f1054ac317730b8581188ec1b43ddc8b8c0cec51b824672badd020a90714c3a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8d04ce4db1726dcfaf5742d2dbc8e92427570787226de381752821bdf47d2a4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a2f9335d9e9fbc21acfb9e56cbeb90df53817183caf86d90beda0aa88a4cc53d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "223914b64634cf11bcccdae5774f6eeec0df7fa9523934cbf3f745ab865c3fc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "677342cdd1ac36be0f835dc2347f44b46b84032e41ce04bb6b77a3f89b56391e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "55b593678e1a9fba16bc18a80e604e1975b3d6ede73b551f7d47610dc4a09291"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "f36a6dfb9332c0227902be5d25a7e0b51d8004d60e67086471820d0c51b32440"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "426e90af62bb463e4a5eb682d9658bc27c3a9953332d490bf6fb81a39889720e"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b6172219a3d2547b4b720ff4bdcc22bf77cda112e00f2a84426d101bd5d0462b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "dd8194827eb1aabb656e8c87f2efdb25f5b17ac9b3eb8ece2d8c2b9df40e061f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "f5f80683f1b99d1f8c515ad47852d8cd8fc057f70ddb4a2802c62b837a9237a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "6c053a43c1e1d8a0341f2fc98bfa5b44398aa98747beade81a8a3ea95705bf13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "14629ce5d5b85dca09bb5dc6d51ea3393d267e6bdbe56042cfe1ee1cc8a47fa2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "57ce4da3d70fd24fda05339e2ee90e199dc92d13f7cc15f0f9a72622583accbd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "00bc9734b721ed7a6d107e6fe37c5162268cdafdda25917686e38d382bb72d83"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6e0fe81d2441905b7db9f95322ad5edbfb2df1ef39d2c90fe3fe0f9ca42893f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5b0ec446c381c36abc89b0ab28ebbf3dca52199cedbf98b8efab81816fcd8e3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bba8fd3163e78e7a87745f154a71e4cb8773fcda1c67096f88e579422144c675"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 
0, true, false, false, false, false, "7e2517ffbff6ed3909e931eb2780fbb6d49ef9a454e696a203404fd35307e64a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "060ca7eaec333978d1fce3a0ea06c780a0fdf25e9ec78cb910610fc383f9c53c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c9f50d82af455eea69f887ccf380746bab34a90c9e965b6e43b95407571f10ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "e657046fe870211689cd12686370e5491dca7c7f31fbf136394a2e3c028aa1c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7c7ebb6b3dfe955b98559b8c7df4171e9964344471a7583fd7e5d721aab83320"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "37c0c88f963312f9522aefdf00f05d83555b0f6d65a02d3b2e175ba3031864a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "11618a23c5a8737ba523ecb3b67a5e654c62804afd6e226aefc5704486714217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "15dc3bc9125e17e73bac76cdb2e1c93f2b348e80d22e6e850e40907555d944d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "f83c0e2fbaaa7fb6651f4fcac5a444afc768ddfe9ca361b875d310faba601a10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "11d50c9a51a9dce13ba8deee6b998061a60d08ce73e141b390f42e7d20204424"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5ef656a6cd0c2f0e1c05b218aea547cf60f1b38a48c10947238ab0c34d127bfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "32b1ae28a9692a1dcf79a8e370da805efdb9052117ea6732568d1ffe8875c17e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "be9c54bb3f33550dc2baa4ee9e5c9356de15c8efe799162f87ea3abbd1506b81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4f7ea0d8fecb94ab44f406439baa75977a208d3682c3abb7e47cf92ded041fe2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "5324fb8f42d2b99d9660f5be2b7d7467bc2efee93a8b4c92faf0e34238402cba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ee95e70ba6d8745ad6eaab5d9b1db01cf0f33b4d2d73ed8a45e2d824e43a20e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3ac979f42279f857cb33b8112bee7b8ad0af931e58c7f27de92f3257094c2627"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "10796cc0812be3282dff61229365a6e3ae3adf41cff5bfbc1d59ee288268d21b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"25d2dc1bca192b09b44a1abbf4d923bae0288771a4ca2b99f794d0b9d15381d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "65a9b38e5764f459c97342a3960629a3984f7a3ec749b1f921fa024330310c7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5711d791eff8941d431f867e3e5aa491ecd2ea8a57bc5465610c0891afee84d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "5c0831c2bab6bfad152c230e1816c05e4d35099834536999e04569cc2d81a07e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "63c0d628828363ba2eef23e78b8cd17b791182e0d555947c1f18c55d797ba217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "953c7c5a6cba66ab08cba945ccac57854e7520305f967a2b50d50ef262f5af29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "6456009bab502b0334b540c63aff4738f8fef3c508409e213beaf9318fcf35eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5507c9db34fb6154808e3c633f7e3a087275b1ecfe308e8e6e72c23a21707c33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "e3dc5dd84eb5323ad05f5e123baa6fa8f6012e3a0f232dc52ee9596e61259115"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d78e36bd477175a1edc6b0e1147b5cec907e8dc4decc027b92b3f2fc51757130"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7921fd7635f79c7b684526eedbfaa4e09cc2867ab8ff7e15f15998333a9d588d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3b4c87b7db341978fc72e2959815263348d6190f24135b2c6bc2dab10d5fcb0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "507e3b76bb590e8982b0a5e0c8d4a47ca14e290a529d9888200bb73a77cce8c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "154e65f8375342fb48f4968262d87219a00f648978aa502f7f92c89d6692b95f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "107bd3a5beb8104da49003e776c0e91b2a52855bba0ce7d8de524abed68f66ec"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "9b9c0db0f63a406c80bfd1a2e09460932f123fae959c877b47a3350e7a400fed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fa69c748b4294f91384eebacf853d1646bfe4b7bd6960ca3b919c8d70ef758b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "48c5a57d1100a1d308d52fa9d37e834eb3c85504df1c2c17b9f77ea5c3caa3ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"ecc6212dd351b2cd495f99eaeb9c47c7c617d71c52fd841fc5236e9bea6da309"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "05b4ca394cc84b449a3a544df618ec3e8496be5db624e42f2c1b32326f2995f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a4918aabb7304c77a59f4c0a7fc26af4a18ecff15a1b51c2dcfd346d201e78a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a8b97555b1c856b98aa135c23af53f4d3367b6eb4f8a4e648242ee6f56409656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5df38df748acbcfe1887e30e4c0bbc0814f601fc9f04299d2955b63821ac09bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cf2d0296bd4fe95f61e01c75fa13a2ea625e7ebd0c64c48fa61a23ec1b64379e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8d1978d3aa44f566fb76d06cfa37fab06d10623ca96fccf9273fb701dd103d04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "69df07bc5faf9d121afde7cef010eb73e448d6a7f9587560170a3f3c8ef1428e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e93cab9ca8a6da461eeda983a19615e4004a9091f5df99622bfc5041b32a6c24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a4132bc9323f247623c8822e1bcaee1f0865290eed4213db0b928b4c34b67542"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "94b4996c64f6db63f37e4fa81b524691d6a732312d2ed1b1d138080f60ba099f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 
0, 1, true, false, false, false, false, "5842bd44a5a89e0fb622c538c278fbcce45c933df80f529a818a0568bf3dfbc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "d35d7e63c866cccb2e3a6068bb84fc6ea3e93fa38e7ac94dd9b35528a5fdb820"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6302461331b3ec55b0a83b3e529f551593b9c7d319e6608c67f4a28557abc32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a14ce737ecd9e0819b417241ff09dae8222946ef2892684bef797cbdcb678e68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "48df1df671d5bf6c725b242bd0ac362762cc20611a5e5ed7161de614b2e02068"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "0a6ee81cba063abc04f907364ee34f12445f388317fce482e3367387d701ce32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "dfcdd9148c8fdafdc6844eae4c6bd4e523888deb52418c2b2d61454a32573dcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 1, true, 
false, false, false, false, "1fe0466c87814507a1457d1d95eab8f83a4de281c2bc9cee29223935cacde72b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f43fb51a23e62cade440065930921e1d146bd449e0cfaf4298632c7f741ec9f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "d6bec74a2309d7d828f18379a3537a702a9004f09dfa068969a8ada6e5820815"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fd0626b045a75ed586deb31b286f3d4112a7d03cb7d28dd9b7a1bf7f2b9a5ba3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "903fd5b4a9f3dfbf8b7a97a93781470e304b36fef07b7d351e01ca9ed158ef93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e4a79073c38de69a5193701fb0729c1cb6ef5c056e427d126b64816f7175a3e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "8d5e141d200681fca7ece1b40427c18fd8b3b4c82dcd6a6b7dafaa2c607efb9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "c639b92234e16ba8fe248f7c1e98b042480967f6052df87fd54033c736cde68c"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "99520eb954c8534a8f0ff24ec983c15cecd806a6a97ded2f4241e1129bec21b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "49375101127abb62487ad866e77501a2d699fcb2ddc01fa69fd480a9818f693b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "064777a4eae5f8c6aca38df43145668772311c3dec1f07164ab7003b18e99dfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 
3, 3, 128, 0, 1, true, false, false, false, false, "44ec993c594495624c40d62e7f6fc63ea6e7411312a412c9c8701b2bc03b4656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "b4c51dc13ad76628e45cc28113c76285b5814b92bec92c72b44ba712ce44c7ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "11fbe42ad29c0016bd6e73b9928d68e79790cc5cb8a30bdb534f1f1ae2179e22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5ac736a44544b34247dc4002e551ce5c58b29f59c1828b053266d308ac01000c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ef42aa3c21eaef81293416c8c05dc47f6a84946352b04e9d7220098da77727ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6ce3fa0d9884ab4d099e0f544548aaf4d3bf6e45093138d3c85fcf4635315d93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "48d160357d41855dad0d74cfc732df15d56e1de73c165472a7c042eb3dd16af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0f7fb4d1ea66aa120fc510151cdf115574cd2e803ed35664069fa060dd621311"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a0f4597dfa5618619219906153add8142b0f6270c6b847c5df08e6ba850032b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "40f29a506404c23e74710a09c47bccaa6b7675e47918acbc2d4c9ce341f71489"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d50c92b846c745a4dc1cae31d0885e38a77d81af34ebfc028574e5d764bdfd18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "a4a24764e4bafc6dc7f5c906351c6227c9065547f631be38fa660cf75f860c1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1f67b9a6d793374688d9103ddc37b8189fc3b3d829dc8ddbadd43fbc1d251adf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a770b8dc08fce95f0ec7998514fe09176ebc6e503e3da19f3b9e84bf01a65104"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d64a8bd7f516ad9352ff5d77eab6478b6a14c152ddbded6f4a9ed1c956c5fa50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"a90127048cd17a1237ac37bc9a8a154fa9a856d263bb47e4bb33efc02629a566"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c4d0013e0df30b68d95569b11fc7e9e2e5dca1270d7e330747543acbbb668945"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "735297d189b43be3faef331a152775c4da674d2d96200cc4ceb56322062a3882"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "23b3ba5e7b513a723676a6af1e89077e786ca0177eeb9e8c8449dc2c3fa297b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1701d35e5b404cc0adea41931698ec67b32349a618587d983e3a6745172267f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "607c92589f41060837b871ef1409b4a3fe5771b137d9970b0123f7d6557fc612"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "09da13be0ce9c098489126ded13670f72b25ba9949b49bbc781b57189499b4fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, 
"7432ab462a552af70a2100df94cded41b0e888ba9cfceed74d1faa42828a6985"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "7b1e645f2e699f3270e8a44a0fcfca04404edb5c7680149bd7c1e5c3122d34e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "bbba2f80d80170fc6f84a67cf3eaa1593fdde1d5e85367d71731dc56c5faaec7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "f15778a8eba5f54e46a6a7978038b0963f99a4fe93387d5815393d39184a5e84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "65471bbabfff790f62087bd5d454c4d0d9f312ab57ba9289e4093cabcef2a447"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "81c8e27d3fbce15b07a152e7d9bea3ecc1da4d2ffac58d454dc81335495fe235"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d3783a8665ff10ff8d495b9aa77a4ed1f2f42982c426ba418aa83560d34b6cd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 
16, 0, 1, true, false, false, false, false, "ae706202be99e5270da6403af25a0fad41f30dd005dbb01207790aeed7523cac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "777dfbcd7ea57298def73910da8569815aac274bdd0dcff5ae6d86223418d21b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a3dafa7e53ad621bec22f9a16b277c22ded789a6c80408be1e705d7f4f1cb7ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d8b575bc81fe1f0eb7ed9e226de9f4a8dc9c6720fcb1af1896e6758874394f1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1d9ad6477f3746afb65cd43fc758c07e49ccfa86450d9fb5d81f8cc5125407b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d4c818c4fd97f5fc3d1ba2ff83d2b30c0639444466678755611d1941f0b272f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f0ff50282473946f46fac36d5b4c45d0a062e911c19a6e14b0ef2c5380875f48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "d7bcae58d478ca7fcb63cfd21a111ad991375c3e9a1a1cfa53763d662d3487db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "14e89d34ed6dc5f3e4afd7463641754d05c71fdb626172113a44b8106078b986"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "8c1b73b72a498f0280d2b1613bec1df6adf85b257ad8c2aa9d0b6c80acde124e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a862c9198bbfef97313059ec7148023f993d3b68927498c2c8a56f173d56caed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5d572d9fe62a9746fd5a9f52dd18f1dc133d4bbb2460b6456cc377c7de988616"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "2271126118d29d60408876fe662ed035ae9539892ddf090ba2e31d7b7782d003"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1848f6a35891a2e540d60704f77b3271d89633454dfc02aac9bc55b9faab4d73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3e1a7cc6e5fc54136a7249cdd812e69247cb63de3942e21a6d18bd61363f685b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "ea7f0e09223e23965787cdecb860f80ec1fe33b2b2b1959efc66b7113b83a7f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "875b04c35e1c67a29dcade12103ee226e7682c551d63f2132263a86b0c530d43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "06c67262b32cdd852c537aa77e5add4cd8bc4aa92d20fa98c0ee297cd3f1e938"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "03a104df492fa2fb791e5a3eab619ca4a260fd0c72a3e95248fe80b05fcc6840"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "70a740e3faf2af0d1c615f13a125a5dfba363f77c57cb0ca67bf7976398e1bb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a820f8461c8ec125f960f529e0afb99198a4ba53e04d5b31518e1fe6fb11d3a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d98e1287ede4d4e6a08af84cec2defcfb2b1d36a67a1d64b2cbb76478e34df53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"0049deff8ba8453340a0c7cdfdbfff3c11f182e60c8db7e84ce4847ec071ca4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4837b51e3ba12b807041addc3aa74c1e841915942a4e9c969de57ad57c27585f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c11cf3d840670105e00839c87f3e643c0bf2ab1f771b76d7192772e9f59ddc2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "928e36d5341ef6c12efeade5bfa89e8c5e57988149081a9118d94cd6bfdd44ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "10d1101784dcd94254ddedd6d0beb29e7defa21f16a48734f36b840114a33c91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f21398693cdab5b75407b7c0814325a43f04242153b03dad0b95973fe59ae5c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "161ce9a86a012abb9e6002aceff238144c5aad2ce686616c1111d8074a2f86ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "0098fc44abf9d3cf8c1c0cc9b3bc61fcfde6c7a2b532c1286bb99fabdcdf3dd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "78aae5cad5d97f224da922e31da21e2b5c8c162a7eb7257be0045779a7d949ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8f04a6f491f0dff2d53d027d82c078c41043d2a7e4b86e37b410175d313a7679"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c1ee3ff5ddbe4b97c180ad4ea06d81f92c07c96292a7bf19a5822dcbde7c7acc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a89a76602e87d3f0cd2c5cd97459509f5b9cb38e4a0cea4b13b457aa441ebf52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "07d9d4e77115ef5809ffa19db2e92f6eff3336b66cf36297fb1a8e196f7c019b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "e76132adca4a31d5ff9d871bee443593f9689af939046355b59997e62c378bf6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f3361a3ce6c7046a3d7edac64015c9caef86a64f6f508743acdd7bdb870d1226"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "54a512759b626a74d7c67d357afd45a933be190317477d8f5fa80235a288440a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "80b458c6a591277a6ec40746813e2e28c88d47a7f2bb7b56c5d5465b3465009d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "c09cba7f4c5386ee3a00d204b1b216c9a5b9d1fded7281122471675601fa6f5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "97b034e8918fbcd3ba9b1c46a0d19303ea72de26b3ece5239db728e64a2d3a31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "03e2c4667f42a19c627cee284bffce01bdb48ee9f804dc8a38ebd807929bf9ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "97a02ea3a63c13476268d0133aeee859eaa90b9eeef3fe5ada5aad7a7785d5ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, 
"dd8909536bef1a2471b2a9c07c28c03f63fc79d6316af1e1c68a0eafd84300b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "1341e6f8e29b614e66c2db3ed375c992229598d61355d64c299d9d95dff9bf1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a368e69c73dba379ebeb83e699c315bd60107fed62b17475fb6222667f485cef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ae2995c800857e044500c8f0d009be0e44e92382f91213513de3eacd69a8ef92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a61dadea2da2810104209ad9f9e725b241dd0b94d783a4f6abfcb816e4aec78a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "87a39842490618f1802968e771b1a6e9e5d18a7e8674a695954195d8becb8472"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2c6419880d7e31627ec141370a762c2a2b40e1a0ad37c44f8a9eddebcb95f051"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d16aaf8db811ada461d309fa1b135d26fb322df509ad0e1600f50b1c86a390d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "32b065276ce67846573f38a6ad7ebfbdd7d56ed13fa898362f6d6f84cdb8a61f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c3ddfb816f212f044f500840ebfb540799314cd3fe3f1d687fea1bad35d892e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "160ef26943c53724b8d2cfa0b9c14e0b1071b8b8ec6949382a27d292a29fde34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7e02fe26f61bf6b481e4c3afc8aab2c5f3e0deeb550e0e955c9ab16479d41b3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "d7fc8aed1e91225ad1bdb521c2c723b2c0a6cc96ef060ea1062d998f9abd59d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "14acc03cb3b846fd86b9d7b71b07293b588aa0a86f73d82030d669caf8ee87ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "1bec2d8e7946133d20c2451f715b6d5a4882072483cc570e78c808014839e59c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "71f1cd6b59ac287c7d9aa52d1ffc111025943f87ae870e494fec00dcc21fa268"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "516be2bfbcccb8502426bb1dca95345b79b7bd7fb138bb6a9d0f5cfbc20b8beb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "cbf74815b3828886304cd012d3cc5f89def2e7cf02eb3d14d690f5a4bb4ddb3a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "af68b00a0c6d060c4fb5c8a99bd278ee08c42da75859b565d2eaf5c687984b37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"713be9e94376a7141fc21576e9db483f1dffa3831ada55f9950cb23a293f5e2f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "c6c5c9ba19137bb03fba6c6994a319ffcb508f1c4d399bae1c53a98ac58cf75f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "408e4b5465749a346919f87480b86236c21b3acf12ceaa974f82f592ef7fccd6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6f9a01e71952fc7eb2a2b7d522b306caebc4b1e7cfe6160c5134a547c1e59b37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "1f35381328bb28f9e5959a74fcd1f893a0dc2c05e301cb6fee14eb85ced099f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fd5b0b7ef04e1214593c566eaa96c87fe31142ce72a138c734e68a252bbf9a33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "335a685cbcf16816e48ee401937bec4d7e32792cea49332eeaf8898c237025f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "be8a360a5e2576c6f14231f7d7d0cad8834b6a31406aafdf75cd6eccc5d97e5e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "16982339c47cbc7c5552380aa41c62563c6a3939e56ce43c7f463509ecadf3d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "bd358ea844b4dce4d8423a6929dbcba3128c9fad4f73f37f994b8d4fd7aec258"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a751a5ad505d223f3e39b820725ac6629bc57c9c592506a35ab2be3391b2b050"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6d33c2de24ada7d3d3d71a61cd14cae6f9a117a220aec31ab615be6c91ee570b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "eacd21ae743ad361f75d743f88538766157b899c5cd97f3ff2af0d96189829b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "2334636ec140349ddd5529e1731486dcb7b8ea8117372073d6d13aa6612d70f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, 
"93bdb9e48c330c3f91538267412526f4a3bbb6a8fddd6882a62a1637cb5db4dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f319c7d9cd4ecac5581cdbb748a2a8f014868ef88ae6bdf0a4df11daca450a6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "37ec61af6176a32c5c911fb14139286350aa4eaad891e5d104ba8f398fb9f66f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "fd9eb14f320afcbf3f6f9acab87d0afe5c55847f8bef905ae7f62f4a1c04cca8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "2a7e4cd4126a880e3f148f65a86130c2da3303c2d493e2c8b5390ae61cc20f22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "a17f3d30b8e9b40caea1661f8d25d243aa90bfcbb7db6f2029861bf904e4e87b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "e69198989fff78f375c9c85740e4a1cef0c0ac9020e1585b1bbd59f26935c933"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "cd758160bc173f449b674d7d6e2929c15f02b44573fe301959b39f83b1f2affa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "782ef3ac348301c04888eebc304e76b715f48b56bd841912ce8a58588975478a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c2d6b9a1bd88c903f9d831e747400170bb47ab5ad9d5e747bdc809c48c5f01fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3b1241a5017dccd6a94d053af67bf9254c745af54c98822e3999fc0fdfb84e98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"7bddc5f7a8bf6b17c4360a329c2995ed1bbd5e073390c2814fa6e2a3cff7f30e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8458288397bbf9ea81405dbfdc5cd76261b5bf129a093f486b5529905ce82814"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6b1095363a7ab392d4834876c03b6938e06e7f03f0d1e6346ae2e8d28e76ddaa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d2db2fd41cfa1782fbd8eb1d29d2bbd31dab406f84c5554eda19991ad7d24143"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "35101c7748300c30eab1ab9abe9af99033f3be7aa33d6b06f6d0ed4c2721a3c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e022fc8c57f0cc20c2c2dc12b1731cde0c7f702609ecd4147b2cc405113911e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "569d33ccbd1bd45d91719340a4f7b3c7e158bb018ea8fe4375dfcc3d7c49594e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "48625692415653ec1f475d72222a745a999fa7982821531d0f627348fc21a76a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "ffa1b95b70c8441e8fa08727710fb104cfbe4b9fc8ae60d007898ee50cd41de8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cff550fd647925f0f086c7239e1bf020e647c4954351cb242f30b9a9d9d662cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "deca6e3dce36bfa4dd965d833fea3f998e0805ff91a1cc6554931c983a3c93e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5aeaf12eb2bf600b81e8d2abe915c9a92def14b1ee72e32c286f139a9f494899"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "51c106c1648a9129ca24749e9911cbd6a19a3421506ebc10830d2d25642007a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f43a5144232529fd455f62d5836fad8f643e9e0f005c7e277974c2f723dfbf78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3d933446539f23839236720137be747198f30b0ff3bebe309c81de51e28df8bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"69a74749432d4929463ce34518d7ef980def4bd5c7c63ac950c5455a44f836fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "c32e73e7d6602795bf92a5761790aa71b72961c8f2c222ce4a5ae1fb491d218f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "feab3e76a9656900c115cdc574d44755450ee5132eba27d95459080a35e1bf45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1d2d221e3516d3795233a9890871df306d2eabfe7fd08727bcdd623efe69548b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "04119a3862bc615e28af0d72e5c183bc63f972ebd588f851642066b0ad2f3b92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cf11366071eac5631cacaf99083c02fa7bb465160f60bd8d59060701792522c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "20f6215111fe12db861d7491ffd7291736d93bf94ea7093f2107b8d03d392aab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "29d241104d761789a558905ba75b836dc585f947875a56505946d81c0bcb1b78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "86c84a4ace7cc2580cb66b9645a7c70835a9ae4f670b070b96b2680b4d77afd2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186024, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "7ca329af237fe06f2db74a810ec49a53d24b789a761221e07d5dce27d36d5152"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "96e8ed8d31d6c724ef1774ac514fcec9645e37db8ae0c5563907178747f00466"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6da628ab5cc2bd1a9e7417932bb4d0f4788ddbd5597ade5439af68209fede671"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a6b7e72d1185724072b8718877fd625c038ffe9db958950940bd7ee760add052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155808, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "19363b2d30523842f8ea1045373cc2241542a26c1d0faf08513bb1f915d9b892"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b44292275a722e916f0530f276ef607b0c8ad99cb413c812c46854742ebea6e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152224, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e42f14a5e8c41605200b29caa7a966d4e180ab417a64b83ce4c322cfcae768ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "dd61e18e9e3e582600cac93ab06ed86e678c109c2e6f885ebdef9db72fffb5a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c64d490f4be1f5b7c89895d895ee52f2611a6f35715ccd674876cab9cd5522dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "fbcd9c9be8e191944d07947395ab1467e098b4ca67072c641e643223def59d4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "646b3453481b8b73802e3c11e8cf66f359d6138c6a1747ae134b2f9c27b87ad0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d55db08d45146055850bf1d5fe24bb97b2103ec4ffba6c85279a3d6226f48d3a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b80161ddddff41bfb961de9fab340312e1640e310bb4d27ea07bf4bd2a0e3d71"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "894a1a5741d35cb00701b818c614c9785f96caa06c38ebc6dada5f67e63a616f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f5bcb1525b0efc49cfa8f266a98a92a0a71a93165cdeadfc9ff50672bc83a958"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b9374108ce9faf64193e64dbaef1133de781c8b3fb1bd0679fbc991576765eb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, 
"8aaa65ae3302a3557298776470d7c53ed664d6f3d83c2fc696fa1fe6f0fac44a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b95cfc93fe62bcd07a7428a8f224809ab65a5a73fcb340f88c0faa2ff9aeec2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "cad28cef119116ec64c54c697dfc009fa9916372e6d3aa5fb928bf91c5ccacbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "42bdf854976fb736599a5ce9f54445243760451372513ff082b888066e58e117"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "06fe3d6b713f27db28d3b892e51548bc0429ae15d0bc6287452491c0ef69efa6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "19a1f1817f582753fb3946c880790ee84e69d1784686df72612490e1195990ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1fd2636c1be0d84681936c4b96d3c4101d9fead468c85b43b394bf19455453d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "284e77f84f4d03c225f92226d84c236c018d434c1573c93caa5d536e92d552d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "fdb16b2ad2ae240048e8f6ff403e5e8cd2b2a0e3bcbaec1c9b7f2e48da26f87d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "ae149f0c8aa67c87843104a3796129e4bb3791d58d82feef32705bc6fe8f2c22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "a2b94b9fa0793ac21830b02e591e437acd63b1ab940189b81aaff1727e9e9608"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "c246f3f1703a37f2d1980cdfaf7eac7d340a254709a80823ab88601d3830486d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1e97a3633c83829be193fc2759858291301863305d0089dc47888837354eef17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "77c745c3d9ead8d4bb2c99e6b87be4f834d48ce1a50dc14071d084608e3ea1ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a0ab900fc859a81123b99d0bd774ce389649f5221cb4065f9775f3e23112de3a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191144, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "2f0900bd370ace63a54c73b5958bb6d172900a2daef97676d1d49976acf1b2de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 186536, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "c7eb7f0407daed5a5cf98c5934a3e33161e66f520f491cca9e78c5ab8f2ab29a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "6ef83e790fe72578521f5b81f384db13863abe8049261c2e37ea9f3d71f42191"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "605c65745eb64a8a7faf4854e92fbf6e9d4129d20980a3bb42df288b8417dbb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "e0387a5401b16117d44f19733cd5fd498248af5b4b1aaa9678de39aa377c234e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157856, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "86be6c5ffe0912c2a06584b2ce438a4d3a1c13eb0586461ec9a9128a8e853e19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "89ca82779e0daf7331224de81d1c78b4a2edd631fff07cb052c53288d13b22e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153248, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "0dfe7b0a29f424fec44f120df430ac523b19dbce881a6f7d731188ca7adcc155"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "c58db7d36ad77ba3a1c78039f9483b8640dc84204d92a314e76d36b87fb1e6c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "288e9bd47734ba111b5e00daaa99663fd7f60f8e470b8e08cf796435220d2875"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, 
"ce94930c44dcd9b1870691f20b8f6c558f1752cc6374d6e6560ff49c0c591e28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "5733f05d1958bfc66e8a647825f08f0591e492dbc63a8353453b8a093463d7df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "f27db3f29ee74d9b77fc2a47672bedbd69ad746891fbaec7259d3ef57cac1910"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "cdc941c9c1235bf2daeb268e8e011df46371e49d72742c5ae199a409070711dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "55e03bc08d3729b49a7b349b323b5c79afbef32ef91c4c8295a20ff5677530ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "f0a0669013c86ca7d9d4f977ebd58e31d01246f6a3a4e561d39930e9cd11cd00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "26953280e4457b6ca0de4a155a8a3e110cdef997db2edf234f360ed6a1445478"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a470f0d3d37b697d7f03f99a3f0939cf3654677fec29ce22de0e1faefcfb3b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "03d374ee2c5f7c0bec6e8f5931a1059c06563128168ec5ea2264e8aab98f5107"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "ff62d27c5f89fcbe592ae07a280b408fba80e286039c8b45d31aea16d434ce78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ebb08f4462f9da831374506c19bb1e0bf3ffe2becc0ace5d1e96a53099eca73c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "85249820e61d9d25db8b892afa7f6778d5f71b1001c17b0cfae0bd6f57ad1bdd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c0319e58718a437aab3db52aec8947f18ff925ae7915ab68c94388ceadfe1afb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "300112392319a92ed7899777401292042e52cc2878d05d687ec4a286345b5178"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "2ccdfa739250d7dbb7661b28f3cc74df9a883e985a47ce4f72511c82868bdddf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "312dab4e691a49d3810a6a987cd871ec3794219b061bfa39c06d95b7e8bb2c67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b5e83d963abeb7aa42a1469116c1be8c00e1c4a0fa8b31f1de4c7c7acd38b502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9d4e85b801352e993da5e5df1f829c13c6965e9f5bc0173714ab2950bd258ff3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "83fe066659782374fb0913511a145a423aa97a4d3546ccdd78ab7e6beff349d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, 
"f010bbc5b321f3826670ea72351bbba96b7f1cab977e6338bde79a502af6e336"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "0eaf8fa68e867c1f6e636f0fb35ac5cc319a99f27dbc84af4aba8d88f6df7336"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4998c57197ca6fa93859de9d8bbb02d62ba28abecd1eda308965cf12264c41de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7c37fca5396207960dd572a958b4f3d160a1de609f2805bd90fdbccbafd0fc57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f3139d9fc683e9ba78e05ab1f99b1ed08071e14718499e718f87629feccb581d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "11621b9ba316364208d31de8d5ecdcf65678476214b7c3ae37d1c3ab5efdc720"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "daa33dc9921ecfc9c91361e2c8ec477595fb9d5b2836e7f1884cffa42484a5a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "6ff12f91bbf7512d2b14ea483348b5a457f0b5594672d0ef02a9fb281b2f0615"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "8402a3c89f06dba58e3fe8acd3b8ecf01103c056f1b0d50082a7289ba6f021b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a436caafe668a24d93082c7c1286d1a530ee28ca8a0fdd83e51f87f4ed7224db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "bfad1a4a0cc86a138a0ab8755c3f37b8fa23dd2f187d27856561477ad5f31e61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 189752, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "bd9f3e9a42c11c0ddca7751986ecdfb0cadcd43c02d9d27b333c81f17de7e345"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187192, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "7216e5b856113e8235fed25ae19ddf298584c3c9cc68012b2212e92aadd289e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a9b511ad8b1680184488e65c524b2481706eb1372758e7768bcf1e2f589eae11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2cfe371c4a8f406f9987d1f6d341c503ba79bfaeb9f53fcea0a390686bf56307"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "b99bcfd4be16b1d88895ba028bb6a47d37732598977c96e014bf4bad96234bc1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 154928, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "32d1f91630a9cef5a50f241010163223e4559613404e54a751adf2d626ae7704"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "efe1606a106d3672320a7dc41ddb4f3a01ba769ee19ecd90c2abbaa9891f75e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152368, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "209d832b77b7085ecf0976dae7e2d4b57e4994a4db5b8634f75612d9c73874d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "376c85460f1639832575c0457333e297141ac62712462552879def5eeaeb3e81"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "0a5a2a18757a69a5de03712ec84057fe4f7a3cdd54f6564fbaeb544aac1f07c3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "b975e4f2bc46f3be8f50a7a5e4c0e2e27f95c83fb72ca2a1ee9d5079ebe1aa67"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "256a6a6052e313158cbd4a21cbd0199b70e0691176b3795679321037bbd24a40"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "25f6addad72a136ca71ec8d40febf7cc815b37cf3504758cb7be7f7bc7e9aa31"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "5f1c85d738846fd02d121fcfe5421b1195f23b47886b488b6aad5b1818a75f14"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"25bcdde15cbcf2aa7fbc875386f90fc07d6f635d96fba36b77a3f39fac04005d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "7629e9c9f1811e5c4dbfe4e5c18bf685edbaf08b5f00fbe99717940b6afe9f41"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f9052c56d52c8f2b7e6c76929d690755cfefed484b9e96c0dbf6616307f2d2c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "37140fb85efad28e5ff660cd25c27e44c4d5a5cd561fcf2177a00e25c269be66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9118fbd1285ebf58c0f3eb915d625d76152765eac43441492aa9c174b66ed8ee"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e11f964b46d89ec952f62fd4f955117c6edaa223bf26e4d6777e4406602e4a52"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8accc630f3bb42266601d44682c853a3adbe659edaab92b639fa0df1bddbdbda"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "41c588d2ff0fe41c92d06b94e254a48c374e2396c4522b2dab04f92a145ff826"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "86fdb61f45e1b66eec2f878ed0a41f064ac566ddd8e2416958407c90c23d9c0f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "22ea3340cf370dfdcb173b3663291806189be85c4ef85c5d9fc7d910667a45e0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4dbd514dfb085b67dc89cf2d0a1982063e7b2b1c1636aab320599d182fbd907f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "26524c98f9f8056cdfb9c975b29ef70161f41504b8215e2b48b8b44adb04d129"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8b6df909201e2f06fac1a0c6604e63a8e4ec4343c0201282ccf45b036f744111"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "1eee5d4df4ea69d0581f62e2e23415d33049b5c82b0fe904c69ec9b83c568a83"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "f017c88f6b1a3130b6801b2c01ffca9b5d7fbd5c89b0004e1bd9c06a2e6c266f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "92a2b4080b2847939df5ef55ae811ae9575d91616affac9a618ef99c066f90d0"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "7ac1637dd9625dc30dad84f959216cb6728df5fa9598149273f787ccab46c6b5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6f198b0bbc0d1014614546ad9c65532fa4f0baa22b7f1dd089266a961e969550"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6714a08e9ec30de5155109defb4ad3f4ec90a449f8142eb661186b5b7b1c12f8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e57045003bf87bb4a8cd8bc906bc7d63b3c84f54fee95375fb3b2268a97e2b72"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "aa72394b2911b985bd225ebd73735977b1ac8ac5d80dfc2a56aec8b34d0aabc7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1ab5d8448cea81e5407e60004c056bdd10c0818a1e8604274fdc58b7fc5b5c29"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fc4decfaac88bb47b9c8d0ebe1814dac77c754c8aefb931fd1f1ac5d3774f86d"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f8976b822ebd5594502a6e925488e67f759699094d9c1a56825c197aed802c71"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "2a0a06941a263f6b5ba6cb7315f4e09c1e899265ff57483aab952f0ad4a4234f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "b64ae92772466196f1e1201b177c6c384891849019ececdab801e04ae6616e9b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 179288, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "3ea4562eb10a33bd5df7c63403326aca78fd49d9bfb0e8d3d54422ac9d6a2a10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 172632, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8eb807d24b240c145cc3afc53924bcb4f9f9bebe3c917665b13deb0dbda3b073"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "2befd3db6d384e8edfcb81a18c46749b5aedaf31ca0f41f89f74f7fd5e339c0e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"35607503635de156a8cca7ab043cbb38156211f5f632293a904ac07d9affc36e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "085384b9445d6abce14e5107cb43a2171d0c960567c16e3310dfaac32e803931"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 145488, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c1bbaed6c931a89d6368bb1897e9439ca427cdf36bd83d2ba7e2c48c9f18b7af"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "31c01825cdf8ea497b842f2e426f849bae2274f3164ee1732ae3312cd1d9dff6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 138832, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f35a1536032de5b1912bff9be6b6ff8e3d0baf9996d9c9e5650f8a5894a72fe2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "08a8562a64917bf67a6d38065c1ae5ab58dba5ee67f6990c25f004df0ac1b26e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "965442b5452e7adcb4fdae5d2ec1ee2cb57254323d75af71349cd7ee70db4373"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e77d59c698edf68b153c8ef216579ec57b9ff9d83db3543894103924858cd666"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dd2d7d6f9331162d036dc08dc6eee879d5e1e1e58c916c04a49bae424772ae5e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d02975bd856022b92f73b4167c6b6a679ac0795e8d196e71cc5a5e3858cf1197"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2c13e7d66b8b816a94fc98ba7b784c8b73afecbdbe85a91c83a8f0b976bdb9ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"aee4cc98f1849f6ea4022061e4fc1eabac1ec2b8ba0b11409db5b5ce90bb2cd6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d54e74615feefc867778c8f66b3ed79fe7daa956005e5d10ad51ab2c759b334e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9b31a600ebcdbb46fc28ed3dc24e333f07f95f5a3f924b19b21df98d2889ac84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "9c2d9ebe7478b08053d99f6813e5d2e36f0c386013198567638a5f4e0d614507"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b5dacb35c6adfa5fd6873cfe58afb0aa61a38dcc6c91f33b5bb0a15a3a26a7cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "57e9b63c7b58d84d4cbb71b0192846810e7447b5ce4cd1ec97e02cb9be095451"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "25b81f8de30978a4f7ac326a1a00051de18f5d51d410cb1bcdd5faa00c499035"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5504933816e3f65a3367e4c3df30af93409d91a51a07fea72952c0c2b912ddb6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "fe5b2bb47aea5d1a459a27feaec5d6841aedc0e1454be853379e77c3294b9f92"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "74bba79b3bb0a5cb0dcd960add60f7a182e2f69143fa1ead5145596ab41b9d1a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7d912db5785402fc999ec6302c16fbe50bf96a041e3c8d4174de1bbbf6e29f12"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "daeb37494d0a8c9aaa4e50fbdb5880c303ae21a44a38e1e2f5ba1009c13ff4c0"}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "27078d7b81fcd4c862cfc3ecf184e8cc4fe6d7467f235148535d3db6e3ab4321"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7a03c4bc4137e07694595cd034da0365e68097a86250a601e0c2d22454c16ca4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "75d019c99eb097d4778939e7de55ba503079b0c585c084ae0088c2a5f61d5b94"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "64209fbb4464feb6d3e4d400ee9311da78062f950b0b2abb91a65df0b9dfbfd1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "8c18ac93e2ca88576872b154449f0ad8f6f523e1e623fda0ae8c380d3e52fc22"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "1d65e97194128eb72823b9a689d9ea43ae293aa37f55e6fd9918bfbd0146e344"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182872, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, 
"99d43cff4d36039329e525774647b7bdbddaac69ef311c3dea1953b201a212f4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174168, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9e98a7d2820007ef2e0732f39cd484b2df6de90ca9d8713ffb1247c5ed8fedc8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "9f110d53b8e9b4a9df50d12ec69067f05f42d33514c93bf7dc9b430b74dcb8b3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "bd7e086cd0b8451b05020c229651323fcf72760711da382a07fa4f4256bf3155"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3b62586d60518483c2625df27047682b048ba5da938b9ab78d060c4790354db6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149584, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7a58416a72bbfe8c96114c2eb70d5c1e8251b4a6e371777bfedf654b12d90da5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "d07dd1125d8b238b07ef9033c7203ddcc0be0b6fe086c61b4a363d1fdff31478"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140880, 512, 2, 
64, 2, 2, 8, 0, 0, true, false, false, false, false, "bd6ef25e1bdb4b9779f9887e88dbe05ac14d3eb0bae5c54aceff0e40a37b16a8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "faae3f46e871bf5605f8264bf013aa86fe844cbf0a28e426084f850df64c2059"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "23b5a7f28855bdd9d09c480bdf3c35ebd851309645a8695af4f01d11563c4a90"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "1ba44fa0511c50431f29110e72bdff1a5234f4acaaa5d845651307573198612e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "d35cb8f85bf0ec1f2ffcdef96d488b2d8e49cf3cf504d664460b3d715568c354"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "71fd2b4392e3ddc8799436f1aade61a61779a8e3c4eaeefbd1688f5f7f78694b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "5592a477099d15d8cbe5f5714465b8f681a07d45d05bfad809fe08899af5f2b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "8d05b475461409b425ea592f92761b43bb6de94f2dd07bb997466e9e53738c2e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "f6b0ca79987c432a80784cb6c0b6a79e6f6e65abd3a362d5811ed188338ee232"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ef96d8d81b6a8773b899bf49907ab6d012e6d9300ed55b3f15ed122bd1a99984"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "05684e78582d38451ebe1c642d8e111ec982c84994de249f6ae45049fa209349"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"e03b72f119e36f9a093bef0da6cf84940fb250c914e06b14eaf10720626165d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0e200d51092828f51ac5dbd08d48c89e2cfd48477b18bae73c86eaa2149ac9e6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2b89864d9bf56db027c691363f45135066a9d4be0012eb00a1a88352afd4ba9d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "18f2f2a39d2c79f8c0e745cdbe50883e6906a3e70efc57f007934142979992df"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, 
false, false, false, "5c8aa63e6b36b41a08a1c22c98578a2e0f67548ed794e42957375f199b6be67b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "de16824a845cb880bd6ec6717f55ec2019694a970266c0f0eba1ee9a5ab1dab7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "76b5be703dc1898c8720574bda6941fbc6fa12d975d587a1b422f5d6b60051f3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a0f927372399b05a0cace12ab2d8f20bc2ba6cdb871939025c66bb17a48f6ffc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "421cb8a599bd669ade7704057eb36b76697b8e9c195b5b0bc333da5c8132a69e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "26d6b0e52ace63e41ce04f26faab394d3a4958a52a3552eab815c4a87ec38caf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "476c7ac0fa3e408c186df69145b77e22e826a45e11d420cec62ad95816dc97b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "679d2f06db880e8d872b70d9fcfed5a6f846e0eb9ea11354160bac6c4621f87f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f51f5de778e2d3be9a67f7893f85056236622b626e1fcf3f70659e79ac5eb652"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "616c9f85efcfc030907d9dfe94885a667543db0dafe0b6c1f47e70ee972665b4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "ac74c9c252a8f202ad8b9d1293560ac7e2aaa55d8093b605b898fd7047f26698"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0e6c5721458df4be8703192c3c5e3ea4b6f5d9ada2795537d14e7055aa2ce413"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "aa2e33ea8fd070963f458cf19edd49993f12bdd0e8e33e2968bcfe2a56f91053"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ae73bd7503bdbacbec935560b5b1ce9dd7a320c21cfc89a2380ff4a8627decbb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "c4c726b67f23cca7597601553e16215642a4a53afdef0f0df5ada8d7ccc53a5e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "2f651bd6506d0d847c3b92038091e0d9bee3879436c0048199ff006b32303634"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "9e30a289e95a99d00998f28eebad503717b236632c3ae6cc85ccb1f6a8000928"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4d2bb2bc80b76d9957488f7cb7b8dd472bafc79d09790b10aa8ffc32660995cc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194728, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "61d27c1e0bd331eb50bad951b70f7ee1858947d1ba20bf44f1546aa545e5b846"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "784447414701221300b034a3c8922f7e7ffb1b4f668e3c0b29b1829852b26353"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a70b88358a034c9bb9728eba5566f0074c01ee65d9cad919050498424d96a2ac"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "847f565a93771adfc68059f4b65c77c5c754ecc4d48fc3c93451e45a442bbd8c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "6ddba6cc046628cc6eb6f21f958068228bc7f35991253e69d6596aa7d8a698b2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 159904, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9d3697ef5785403d7e3e8c2fb66f3252e9ffa713876db73efc5eb240732b3224"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "8445a6bf67dd7dc7cc5314e0370c89cb5ba1bb8767dbbdea19eec6b9d7b27a24"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154272, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "28b0941f58c1795ffd72af047959c013bb45f98819045f78767ec8b470a5f116"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "dc657891163f426f5d272c2b88300837a6e36b49dc530cc34ef592a6d86aa482"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "929180507d8c21a439ab30baf0d991d55846f4abc793efbcc4557134b027733d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fcc61dea00bd05e1a6c01e6b6b3a2739e590662c7f84a1c7399a81adca0584d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"7fe5ad2c1ae8e4579adf5cc8614c1ec27f9d5efb793c3dde12ecd61f092f9701"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "59ab40b6f4f72c87479ff2eef8ccd5a4df8851be321aa28eff86762fdbacb577"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a458e09c2fe1a2976ef14c46582759428c9af39938d4d3bf2f60baf9b09b8857"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "dd73fcb9b85cd985d5ee07af4324e482dc3e26f3adb5f3a9657571711b4a67c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "767c7ab673225e7c40602e03693c2867dbb70c22dd899d6e7155138396c3d7bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "34b65256cf4aa2e1c0abea62f5aad723dadec2f71fa999eacf44fd0ac7bd44a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "969f552593aadc0d272f23272c6ebf0883867686717d153345798bc773ec4130"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "4e4a1966b0d42be5a7a39369a5f66cb6c821517ca1897b28cf1df1307f131387"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7739317d1de126fe34fcedc88e023ccaa7f9d0ad80f5d6828c987d4a948e3cb0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "911cf744df4b75669be6c29192cd73134a98ec3898fdd83ad87959eb41c777aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "25eccbec59a4ef11e4f9cf44870f51c0f54e2ca0efa4aa5b60085b7788a63b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f748330ed978d30e198ab7e039dbcbecafbf258b46ae39ff5b754bccbe8ef5f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6caaa0daaa3a89a2972ef3588c4fbfe8705bf6b1147e6457976771513c079528"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "586ea54c82aecc15568a4478088669b85aa173fa753ff8f470c22847a2e98a99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "d27a5e729305e22cae6b983902c9c477ad6a5e330538582106623f1142adf7ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2d0d401a424dbeb848eaf7169f820572cfc2f51759e9312b5c476407e99ab28f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "634812cc0bf77639858151c34db8e42ddc8af477866f80024c626c08a225042c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "716dd346bdc75cd255c996fed228929069979156c5b9668fed844b56a502cf91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "f26b594854de7b523ad23dbdb5a425bb7c1eb14133ded2aba375168fedba83b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "667acff03dee556d1914f273b2c25e0a81e8a43e7d6556fae6b3f69fe12b6d23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2a57ed946eec20ea94e40bf547fe18cef3a03593da59bf2a6be57bcecfc11239"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "dca73315dffb122eee7b971ebe7a5c24f42e4ec925a87bb7cab6d62865232fcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b76abb2a5aa06b831d440d514f72d09f60e0ff422a26796de8d030adf2d95887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "5b368deaba9341baadd05490961c6b3692b0f59459d9cdcfad66392320da3be2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "f54020eff0be71d46c7a42d6b34e33d096b65c43ff7352d3dc11367b782725c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "5c1fd3885cd392e1a2640cb4aad478bf3b7dc20d24b3848bcfc0a3d911fda1a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "faa24e2035d517a0df40090b7b8461e3781069bf7a07bed8e0b39615b5972509"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "230962bcdb289dc3715559295673736a874bd074d674de37bda8a0e7bd1c1ef4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "edf58b47681ada425aca54aeb304c5566873f8e292e4972490ed87fa34bb9e59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "af8ed68903976fdbd11e0c348e55150abae89e1d682bf859b967a3a42c90e4c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e6c79476c37e63967607168630cefcee1ae89b73fa2986e14f223453c90c4e93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200824, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "1774a0d6fe5aafb9bd00abcec2ab2d892947541e7a2240a007c4899e18511f0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "000ef5e07ef5843ab68c8b6a2b620e7903977d6a5944de62ee19c8847eeb046b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "90aeb180b7fb75e2126bafcddb2ac6b45101be1eb85dabf972d79b4e3ab9b804"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "598db7ca77b8e9976182c98d66769599416764a6693e49820e5785e64660e53f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "42ef50891a1b8095efaf5484906c9ace9289857e25f1f13692598b5138ce554b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f1be3ed572c82f4aba067869db5b5d3e31119477ecd8264d40e1d6bed61757e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169168, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"206ac3abfdc415b7efa183d919cdacc4d4d7670e4d382c5f608aa875da402bf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167024, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c379df5533678d4dce822816421777c13e3199a80acf62e2ceb0d9819f561dbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "d6b31c51d57fe0244ab9ae715f1ae70ed4a1be7f93ce3b0752e3c26633cd4e89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "6eb367aa30ee4a3c2fd046882bf63cc9795c4453df8a2f1965f3dbce596aeff7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "cec9ccf65005ea794fc625307f35c7f30c6cd9500c600c8dc913664bfe465a9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "633f53bf3ec35a0faaf472f8f0f0628364933ce47c4edec263e59ae3029b03c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e8311fad1266f28254365e12c8669c5384485c5fd7355e6cca6e7d10dd1f76df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"1d29038bc03d8dbe073c14d25e1ae7618a6441a52438b23dad2667873737c4b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "018dbb96ad8b24f343403b8e88e73d1d36d3e8ba5e830a6dfc72145c641acf70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1c59d688b959e97a8da273d994f9fe2eae1d6e253e88b9c9a574573308affa99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ce0166438ef7f70cfef409b644e1bb5891399cafd4511ab36261a40bcb09d6ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "68f3c6e7798ee974a8e48e4909ca8c1e8e88aa271d8019f9024a233c0e9a52dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f67f8968baa21f0324dc911f47ec0bc552235931b852fc08b82b2067a261fab0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "23b84dc67e7c265ea8e295f9e2192a314a463b6f5add916090cbcf60e0a40943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "610e861b828b5f21d0f604f6a495b88b7b8e47046c818f854990b8f0bfa6089b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "828346cfebb3436860f8b9a7a0653bc566f02fe00ba91a59641af1d4e6e62d9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "1bb81fb65be6ddd18636ebc500da183948341c62b40375bf3941fe94f5168735"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bfd2821edca0b3cd10015d44857e4d083aaccd8c52989c25890f4d3df49f2267"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "472c10b43b5f12230e141cf5fe3ead8241b1b3b09bf29aff250dc4d3e8c3d7fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a4be8aa7d10c7726e40a2ca6365e40d55b41c172adcb2d3d8b2cbecc09236048"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "37cb844aac630ac66256e0f22a8233bdb8ac33bbece66da0886a1d9a63efd0ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "641c9cbdecb093855de02cd06c290821a71d9f104deacbb84286e1e2a139e58b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d51da6e7742be64de3cbb4aa0031300b7c8c3f5b3b457567094f66226b77ba3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cfa017421202afcdef34ce49df9891f9c90b8c4b9d9df7d56a4047e8e9ccc30c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "55321c946e31e457685e09e703fb1617b167b972b91839b7063dc94a41bba0df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "01edc608c39b8009374582d8e60b3f7e9babd06eaeba5a189ff83633fed8aace"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "316f8380c3f569082db521bf894cae51e5e774ea9a5ee3a12261cf95cdd58626"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ef3be933e1dcbe0a91bec0f4530125c157a31cebcdfe5be72afb9e8f81511386"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "7d3eb76d0e240d0e02e9ada4db2c7895141bb9df9bab1da5d9963b38a038edb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "ba8e949d68ce117f587b6a8150ba45dab61e74aaf7624c2914bc57a0c2b67164"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "64d4e91cd357ccc5266f5009c273bbb2e7247048bdffdc07ddc4c0b3756b0d74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "e73b78b8bde77ce90a828912c10a8a139079181f67e4c878af765f31f055f8f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6ff514c284b9e639ccc9c1bb666be4136c86d618aaba653f0ad94d042e632d97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "84aa0df77ac668b4d05af656d02f60704bedb7e391124d3e0657ba3effa93ec3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "49699ccac6af75f458a7449e0a5dc6c4c6e14c9da9eb57efba18cc8fc9fedcad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "bbb835ef1969328da14d3f21bb923510f2aa685f8084eb56e9f961afa0a95eef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b1340d583c37df0cfca4e4559813b377f3f54b6e0237c8fd8c74499fd55cc632"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "5bcd218785909f10bc755b6bfcf9fb17043682eb36b783cc953ecc78a7001ba8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213624, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "0b816e03d9610b6af85442f5fd71dd903d8da6fa8f555447aea7a7f9abea5f0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "59a00a340e4c1ee937ce23e1c423429ac481052811b6072d7006600064fa9d62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "640185e57bc593659ad8ac63d4fcc3cd9a3a56158f05ceb6d79c22cc0f4799ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "b913719e84cddc5ace9cc8165dc62068329e2628b810511a74dd4ffced38c551"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6cbe9ac327585cdf3a3b63f2703c5f6bba692062bee46e9499b8123b648d88ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f2f2d225a7f3b17754d04559a554c97e4156150d592f5eddc93a640c658f451a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"786d9bc7264812a2296095d4429310d95b243db7a13ee81ed93e45d0eb353217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180336, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "cda79cd2db7fc931a80975a9999ad30ff926190bc711fcaa7ede70684e1c8829"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7becec11aecb9561b0df1834952ef7f97d6438ebd2484096c80b3819dd3d9bf1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "24eedcefdc77dfb7b358c992a6693af492a32e267fb9104f3bbf575b61b2735d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "96249585872dd5a15c00c72a6f27bb658bcabb842f8dc416d445abecdd2dbe48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ff7dc268072fc94c9382802e5d91e5f6267a91a3819246a8c819d1765ab76b7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e8bf8bc95b65748a194c718f191a85a69226f35e7edd4d92a006cfa88f1514fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"6b26baa69f0db4c8f58f2adc7f5d6d27d3da699ab527547e59efece981142480"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "d491f05c6e3842ca22fa7b34ef7df002c92649016905bb7d5bf70ea337bc489b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7a39236b7471ee083860aeb37a4d816295f6bb7076bff1e61c7093af61716b1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "eb96756caf8166f3b9013c26ec86e810df2de587ad1bfd23a1251c69bc03e61d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f0c0ef8ab00b29760588ba440c6f6a41d73c47a972c2c83194a3b93047a676c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e7b65106ea5db914779532c4e612dee83dddd7680eef49bbff4298542dea9ecb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "42aa6678970323a6c5bd09d1cf0619f3cf06ecbd7362af3e23df753f08c4086b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "56429a48c1770f605ba6ff220bb67488aa86e53d7e91a714b6958829ec085c58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "85545f19d534b3652c00e2dd32c8aaa286665269dab875c6890ea8e7e06d186b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "4436364384a32c4aa9b574a8e72035f5752451c36940745364219466e66a5644"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "944670eaa18656ba58083b89abde13eff26f21cb1727ea30456d78ad90e655dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4592dec90eed34c701a7f8115e01aeab0652cb74513af96d9f3de07e1a84d750"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "75d2e367d682cca150982d60f95389350fa1de824e5a44b478341c6a03136be5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "eab7d8c7ed8805ce1bf1e4c570a5c889dacf602b48df4a9853713c278548147a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "018c232103f63037056ae2076b97476c01caf3c3ea7ed0aa427f533e1f5dfcca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e2316122c4390eebdc63e2c3a80ddb683b3312925d0d1014c9e1ef34eb523e67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "debfadee9aba3cc95103a8ff009d6ae5cdb39a0dfe4964e2180824ff8708376d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d0e0f44f93086ec5a4d7d241aec81a0b40529db6c3a4442ddfaf4357d6883a04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, 
true, false, false, false, false, "3305309d2b24c8765aa824795253cba8d5dea0a9fe3d382fa8c906fc304f7926"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8c981a860c3a8be3e3df5814728ac938dfe74e548c9c34a9205696db02712cfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f12b59391f5d686c218b56d1945fd565a8a7a5d00c2fded5675cad324a641958"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1591a3b4cb1b3ecaa750022e7f92d874fe30f07c224176f0294fd0d8391550ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0c2660eb6c6945d3203babb454863f7112d44e0f62bd0928aa8c8972b1101451"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "7eadda9ec8f484dc71ffd579bee89a3b692e1316e02bfccba95427a22e412ce2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "0fdd5ca8bbe3b416775ed0602e17a192505dde1f68974404363ae48f9f866fb7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f3760397ebaff484155112c2e29ebc21940e3a2c86da7b706b80dc2f51aa01bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ed628c99e48a4d91927358f122a318a8ad8eb96c60e905fec4f6ad0624463b82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bcc1cb67d3fbbe0836512a1361722c27696683d7202aa39db3208be9ca9c7e3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "70a698a0e1fd55eb7ae29b4ee4cee367e0be88f5a9fa6d3dc99ba65073e214a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "8afd36e51794fead0587260b7ad64a2d5e53b9e93f7104ad15da031d67f9ba8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "41f4ce66131acc7fd6b69b41cf08a877102bc6cd8a88c67dea6a217f656b1a98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "25a73e1d6d5863fba9491cc47c64db15ffaea82c9e9a478e7809b5e1af2e3af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "60f8fd1f8637f93f7dfaa1bfe290eeb9c2d83bf9e88fe9e14b8a3bbfb9b73a4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f413b0c843b661657f2dde791104c02e01d089b64d1c2325055a01ebdf96a06e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "178b8c0775d9a4471750b8d4712dac3a54a577ffd8fffa6dc19a7ca41a432c73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "297373a590710f53ec2ac9e84ca7fced4abcd519f5daaf4f94170940b49d897f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "76305f32b95132ce498c5ac1059a68892308dccaffd1f4f96d94092d81cdf9b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "cd41fc09bae5a999fb2205437093c135def419018acc1d4d3f5f62bd18f3a928"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "a6fa67752a06c1a05eb53fd1ad3400a6085290095fde02b1e5ea1e6695111421"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "db0ba3cea69417ec06b24019f1ae36a775f66e0680576b9e0f01ae47da89d511"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a88886a4f8d324308bc946709be200b5b7d863bdad76ac6742eb443027fb6e79"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ed0fd1a0e92d2fd05dd62cdf661550b5271d02ead35b91e05e65de051adc21e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3114d60f63ec476e29a47bcf27c498881387fad146d2f0c631a06ba8bbbdc73a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "03aa7dc830f6b820223356de34d12ec7aafaf9a0e184e0d99a5a8b923aa1f4a8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d7376e058a122d1c8d954ad02507fd55f131e8018d369358f2f69d25035ecda5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9e793334c59e5592a8b88ba6527385c1aa98c64c109dd4ce725db9abb2272be3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ed2c3ccf6cbfe1976c3b633008053c315c7d5c5bd0781127d9081e8fb0ac9106"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5864d64b853f2d82bebefbef10b23c4f66e0e6c214e5e9b1edafccd3e00ca7e0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "cd4f6fc784db62948fe0bb0450ba5ba2dcb5ec9b9d144c471f4bb8913dcc014b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "76b84ef7a9ff20648d5d45579982e36160f59d288711117ca557fbc4fd22a1d3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c6ca7b460b0f8f6b1cbb1f37ca65f72234af7494ecc48b031b2add659ca3863e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "636a85cf0e3d4b3a670eae26e4dbf6011b90ae0e15359930932daf74d2017eca"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "eec694d468766afa3bf2d9515cb51daa009afee1f7c83a154ffa804544a0e7c3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ba712dc7f445f2108e0e9a3e29c43d9e654e86f89619314f1c141dd481258c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2ea339c2cb4e49d890edffcdd4103381dac29677645c7cd4d1eb9a33d91c3e3e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a2b70ddcf959026c70417a88a1ff74e0934f803ceefe208c9dc5951e99120d5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "950f3efcf2d1a4dfab705ec52188276b3d3ea047eadcaf7be6cea64fac51b7bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "066a92a0510b5806f5cf6f4a4155cd949a953e3a14822644143e48e4293409f7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e7e26dae1d0cba5b972986967b96ac8e6eaa079aca1aacff86b9679e869a6541"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0eda90bd79312d23ff7c8d51cf9e061cedf8dde7c658485af096e659db49ef94"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "fcbe9840f818a4a0f0d86bdce85271ed40492ddb7e344ff4c90a199eea0d6f99"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2f72901b9fe4c17ff79f8dedf1b35eac14adf6dd8603a139103f7ca9c7c6d289"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "89ffbd820ec67f329be9442b02f015806b021e92796af0b8233b731e7cc46ff9"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "fdddf33be68342c12c05d5a47648a76fd7c0185cd41ccc4387c55c6ba861e2e6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "854b7f7ef1e0ac926e1836f266ea326e07a148e3b18ace39eb8f9cbb1f9c7bae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a86252df5c0f6cb05779e0470ab81066e55e4ceb994e08561c9609dddc2f62a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "cd5d40dbc4b6edcbb43e8fffa2841e34d93428ec87d25b90280e2982cef07305"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "44a97a5be71ff103f13b99770eb0275bcfefe547c76841164bf6db0768675cb3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8d29348312494ff327c43a5e7ed7e7dfeb6ac22a76d19b0ed7adc3747f660114"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"b2aa71c8ef1ed3c4230b3984539a0dabedac5120a31fe6b04184eb1d3d5529eb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c0792f2abda3d8de896f15697b1ce82c6c0f949cf0e37616d0d5b494d87fb625"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a9ca3b74c9b32daf6827275e57cbed85ed675f906709300c5d0637d3a5ea7a7b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d59677ad28917c353f594f59be535379104e4b8ce3b6466e6a4285bddadbd97b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "5fa98c592e785c9d5dec3151e5cd7da46eeb4b12b08e6f0989c43acc380c766c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ff77f8e2a7c91418dfc8f5b9e8ec711f8ccf670c1369b81db3879508811964ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "76c65c651f9336ef60ef26406d8ef1dde0a55a875b01b669873a778522c690be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "2bd50d7cceea41a4140d89880daedd66f02c9a3d32cff0002e269961b9c73db0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3a06a61f79246265f820a0f61780f0981dc19f624b71140f3c74e386faafa439"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ffd5e3b92cfe6a7c81c027545fbf4b028faf2c8851ed48f79655218f4d9a7940"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "536a4f7e6a3d1ccf8b12a453e35ea987fcf2fbffe7a9d0c9553c020c56053a68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "e4dbfcda778e32514ee0df7c533ef2e6a2ace9a1eb81052c447aa97839a28da7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "af72b9b70fce987f55c61954f8b1d802a76fb742716523f7276af7affce76ffa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9899441b1f84be718038cb7b25972cb11b720178db1021add1c79068c8d7069d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8e1c7ba3059b060400a9e02cb38a262888df668ffc406058f699df3cfa683e82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d1ad38c06ad6d4366ecc08375a3acaada3ba02eee7eabf3a9baa60d1c61e611a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ea54218f666076553951a60090dddd3de06748a0380d4518bbc58987d1221a0f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6fce2b3a691b011e3ec467d51fd55f51f6125b4e614141b012e1a987c7480f08"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fb15e1d7d950205371e0e235c9b9ca8ad593430990e0004e3e3d65e4f4a750f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "626520cfb16c565e349fed6d35e5b94fbaafe8b02b463fd0ecedcd4b9a3e50da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "862efd32c906fd35bc3a8b6954d2299babd1517a89d69c839a6d9c0da7d1f7b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"adad01d4ed17a2953917f005a54f5110181d7c12d0a7f58865c66ab115b0be2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "14a98e8f8ee2d54d3dd645c5695fa32e707b7d0e934a66e44535ff29d10c12c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "24bf7caf636cc88004985a2f2b7f1529bbc74566a69d9636248e5bab755ab0cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d277e3ea2dbcf487a5b2d5b8f69f2ea6e09ddae8e0c3d068817d685ee2508a62"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "24f5b7205732e0eca335fe58b81bb21a2fee018fc62c15f4f5589dfef38b93c0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ee0656fcf86a8a7ee0b4d27bc657cc425bd771191fbc3521965476b5ecb63f80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ae960d0c3470ff6e9cf74a8cd9531d2c31384d986964b6ca86bc4ecf204aaaa5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f0696ff39cba9d6b7c85d889b11c2d6fa3913f8d8c17a53b80b3a470b2bc8d64"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ad20ce6c0472cc4e1a0b5b6fb2a0b16cb7a958b002a590f455f40b34f223545c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "04c8d6197ae0eed1db9b456368161982e6fe9c3ed168a84f29810277c6d66639"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ba8f4ba981a0d020decd86e1f5b903c3adbe8a42dce3181a85eb4d55d857910e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f13a5c13827c4d6881dd961a4dc1d59e4687311cca4829fd599bb9630832d1c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fc8f595f85aad17dcc156520c857dcb7cb4e926fa5d4a4819a2f278382b9adb2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "69e36aaafcefe25b68f90f71b3c66745f953912d6083f62eb87bff8fc05ff297"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fcfdfd8ee1b65df4cc99fff29576e33166201c83b296f6c3a1a061681086b7a7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "85eb314552c299e8dce1d1a8b170722f7c58fc37295a9cc85a5d71d8cbd78d64"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cca32972de053329de72567cd977a47cf6a7be7fd58dc35829c78b7fd689ca86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, 
false, false, "4d55f04aea92597f45392722e2d9af252fb06943645371803746b6cf2104eddc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f2bfd37b73c64be475813e01757db0a312e6ddfa3036f8364c562d57224be082"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "be1d2500df5b8d3759d2e0108118fd0f9145c90f400bbbced2c5da7756b62c25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d62ab7218102f427afe08e018b0d9572d9ea405663f55bfaf4b87b179ad1bd9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a90695355cff072c9f009926c69145f16795e51356cd44e9d4090285d4c3e936"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f9f5b6e27b925bfb5eae6164b1f251df1dd7632e4f0e64ed17b2225b6ef3cf38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f0d770e2db835c045b9c61bf587d038f02562671e258479c6538ab23f0381484"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c347f184fccc0ab203ef8d566189893277de1aa07b4716b11866329849751590"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9d23fae7831f1f26bf97075285b4c7337b5e13d9146399a21b9719abdb9d64e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a43290d5f4ebefd822482a907fffcf3878f37bb5c8729051d0d4569f581070c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fe2eb45a452473f5cc35abbc9cfad6c06c03ca43310191c85afb7089126923fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7d9a1982d4291e80579132ac36bda6209f005bb650829a32aeb616d01cb92ddf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "13a75c8ad09245eae45ec5cbdecf66514f18fded0bdea2cbada6e232fe0b16e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "21f13470191e87709a55cacf9db81137ced8f0e020c4db6a3454e2303ca61e31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ac49388d17e9421e7edf614235d79b7652192e79b0feed16d80dabb46a656a04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b7bfa5dccd651f00e0d9e1f894999867c4c08bee727277ba41cf804afde80d6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "afca4fb0bd2eb26c0897816d13cb9da453d8472160df5795449a6dba54e48985"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "36816087bd314d42f851b15cf48e2849141f10bd77f631c44252a1b365630dab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, 
false, false, false, false, false, "3269fbe16c212fdb1158fda1c8abbc67fe68144a92ac0552e63ff227f6899392"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b319d268dd377f1e436ad885dd63d8523609feaf3e69e8b4fa39cdd230683723"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7c8f1a3442a3c5e1c8c3e20aab9c208e818901f2fc5e7eaabb57a7eee97dbf1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "89607d87ebdc8e006989e732c7f029db222a2dd6776db48170ff4ada21e110bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6a84c65f2b14dd9ead65b4113a74f709f4ca764c808ca345d9d5946531e37247"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "8a9268670ba8adb9d27ac36d7c3f1c6b8c974469c564be0438b974b4ccf5cca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a02c03f0142fca4e2377358efe4c562278e7a19ed0d91bd41ad5f80d27a96e53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"86abad0abaa846585a6a3bd96176a8ecf37e6f017f13c4540cb2a3d4df00917e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "fb0aca7e8a380e827c69d9d405354b1f18ab8519abb2ae4ada024ad6b419e766"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3a9ed9a4e0922ec792ebac56fb5c6e3b0f45d8db7bf3c674f9a39d4946f04663"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ba2a5be8dd6915c18ca3000604be5023bac3e78487bf475d9befe81058f1a478"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d975944fc7c978737c636fb8338dc5eb8dc8f47d1077013abb0538d56a437ec4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "160240b0c02da2b1a82f63a8e8a740bbd7738ee3e3d317fea3f7de7d9ece4577"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "52d02cd28446bd5eebdf676146cba47bb5ae6fe6c250da10eaf8cd74945d3862"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "144460e08a9a51ea6a4899daa6a6bddc25c8387bfde068f722f03496fa70f815"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b41a9f3b7f0e06d06ee1c22f509075050eeeb74ff81dcd3178fdeb8217db3032"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "dca4d4b23f391f70d6f99fa7a7dcaa5ebc2ea3a9e8def8318893d1575157645f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8d3ab2061c2fd804aa5baaee4cf19bc102e6d30b7c0bb09ee1e151f16a91edbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "871d517d34f9db01f974f3731bdbc8879097c78b30abd66e958e59adbf6b9bca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "bb4e33860a17c1ee421d6f36e46e24ead8e4f52d81dc7b845628c90f44e98e9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "143b2b19174cabb96e121fd5a2ba3aac08a1ec4edb2ef9b6f17d167d83214d19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"e40f1ef4531980d6013f4f67f2647cb6fbe532ba5d16da071f19ca10ad3a3589"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e86160c615e0163448d29ccec2e846c3791979678bff88a0041d06c8024ff943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "02333632dbad5627db0ef3eddbb5cf3d1cdb560db6c487ee966af8b82a2d04ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "72c6d6feda7ea5080eef5e52ef422ee35dfc8b3ba93eda2efa569a97ae7c5dce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "09ef9bca859603d8edbd1ff1fe4b462f41703b025f2eaa3255f368ffdfbeb8c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6f2ee1f392f0948aa86df153aba8ddcd44519f1b01d08624eddb21ed73163ca9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ce97cad8069752f70c595ea01cc8fc1f5e294284b761972153afd3e683e2d499"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9e12ad7452b72a62654d29489e3d829232a901d508a888bc2a5a3399976b74c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1cd7bfeb753dd4893294acaa7f1adabd61ea0159a05aacf651c52326b5d92fff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5b47e283c8cdf155597950737b0e498e9dd49db25a015b08b76fcbeaf5e6f8a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "341d859c9c43710cf44947ea4384b6949cdfec2a8c400e8cf9c6f59a2004c230"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"d181f58be9e6b31388a3fe459e686798f36a63d773312d98e3a7b42872321f8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "12ae2be2bb45bc476eba7aa3255fa1eff2054ee1782457df1f8ec4960e45823a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a111fcd3da5c362556e5bb3e786a448e362f59a62abf9595d3e92a2251c0a060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7c7224543bd3cd2f4fcbf64c80eb7150bcdf4ce4a923b6e51fe70b29545f2ef8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d46b7ca47ce2061c15a5623c5445d9fa3d4d92cc6f442bc15f1618752f7b1738"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "edef9379caf3b2a6e1a71beedf18e5694dc48e308c57401f0587de908e2676bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b54c2f8f7f3787fbccf9b595e6703a26e50e908740c41b504cc6079a79522a43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d48b58a58de8dce0461cb211400611645f2dbd3a0c539d9b9fef0140b6198849"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "aee5dc87310f441f2512fb5351527e6b4bdbce4a3095fc269656441dc8e566f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bf1a712126388a5632bcc950512ef67b1000ac9600efb7dd4d9b323bae569cd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8da95818634f7423fd0690c29a26d64c2d915095b3b0390416066485eee6f210"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "44e0a615a03d6ec4dd3794970096d3f6defb359fa9d7325e28df00e6c01fcaa9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "3050dd6787b2cab9094af89e7b32bd9ba4db7603a266398144fd2eebb8565a33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2103a29cbb17610d104e4f46cb9928b411aa5a01a055a48fdfc1bcf8432bdffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "13a5614bdc309244d3f6547ae622ae2e1ce43cc2538283b9cd8781db7312098a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c112c35c42abf9412a2499e2dd31afc5edd3a68d030c725b2a63c1c747e5cb40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3d6e5418f0a55f580a389c023832a8477be75588e50e4ab768826a4088e910b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "06a4537fffa8d351d2c7fd411812dbee864639940241c5697314af8d12a8e297"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "bfa56c35c028234888ccc072b61599775b4c604d1f7d491f7c6cbb9e97d497b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "709d074e5d837bc1ac7419fa35d290c7e7c0161876f6653524322ba3ff5049de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2ff4573255df60c40ba406e377a7bcbbbc77e2dc2f20b829385a6db4a67fc79b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "482d08ef46af0b81d50913a643c757d2639f483e140030262f8cc0f620b4410a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6f52ab94009706ce6c5129cf7aa60eccc89dee10a7557ba7c94563bfe8a68c2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "474acc308fad37d158ca39c8663aca2f5c8276819ca88e366ca64dbc67fb4304"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7827f861a067978a4b5561bf37e1b13999b5a20a72eb5ab0f57e68b7f195ef8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, 
false, false, false, "6b10e1b9139da90c1b9b6bb5cf86ba0fe6882e6e03cf1816af2ee43fbb6727c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "236fe1d995d3341d5a2ea9b4972d4bea47e643cd78a65c8c443a314f3e256379"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d5c07647d8383eb409cf2d5c77dc095f6444458aa109d05086d233f0547af3ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c813f3d27fb2d68eb36cff8b503575cd81a7998a249a4c24cda8af12be0317ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "35e255a9def3f7b0301ac1689cd10f71a36f125b43c941040bd269a15e9d4a6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "117721a2131f90ee8e38d395c7d9089a6e6654363e75f9791924e665251ef5d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "731c2bba7ddec6ea48dd7cf34873b773e055f3e99f0d44e80397c77679520255"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d57dd892ec549c3a6365da7272cc1e3da8b60c0bacd5940998666e080e5b9abc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "aca4065f41e75408a9ad72627f81f16411b6767e25c31d6ff9d5be033d4f5bc3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d28f363976c380d52b0907c82f21f74d47cfb6b497ef4c13b7c884309967b5f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a2ecdcf9992535157c496cc04f5a7cb3bd0c2fcca6cc34ee2b52f24775bd56f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"b913cf4a5b190580fa510af31e8240e3568a857b3d80c29bd0ec0b16934775ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1276472835b9dc1b612663b51a311cd64649b876a4e9dbed6c8590555714ef9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "5b706f3d0af02fe301f23e64216c0ecb5157891ba5d9575029a597d85e331cc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "14f3af5cc1f6b6636aa898faab78dde7a23c9196bfb5b38689a7da8581148414"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "745ec5b0ab8fbae957618a72d1150451adc12727abf48db57c053322a91b39c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4741db5519ad9c874b4d63fe5631424077bc3a9482d3da2f79e465f11248895e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "74707ec90ddf9eb55cff42110f8c1b4e02b2fba1c3a992b12ec16b511d4d2821"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "0289c96f894fb38747de0bbf736af8d3a7fe968cd8f6dc0586172cef065b10aa"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8cd9712c35e9d4b57ea6f3fb42889d9e64e6fe8daaf56e13e9b8b623c2ce3483"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "792993a8c1430f4526ddb5b76ce4157b8a476595551fe939983bf7c560984d11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "64c800f03f874c0be6053725181f0ff9039862aa58ce733b6cb2112068547a9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "933bfd0a8077a85ac36790c1357ce48915c638e20170c5cd8e27a67040acec42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "09d814127130ace88e0fdfbcab202979034c6015522fe6203d824e312b275dbb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5630a65b0a558e7519c87c959fde1df26181c819c583897ce46de3c3a0223def"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8fe17a54bdf442922e7c580dbe02cb5e3ecd8eb4a7a97e74ffb99f31ad99506c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1d00202b74001e748b51e51ba58f4f0d807a2140badea4973f92b0915a6301e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e54cdbdfb65168d8c2dd68f30b9fbddd8a9ab5ff07566721a24fecad681dca42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7f6f8fcb538fc69c2265cd08b2569b81728ab518001726eb659248ddba4d5e21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"40198a3ae95d17da3d251b3f064a80f96cf9a6861c205e1e09737fbf0390454c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6eaae4978b1d108ebe9baae124288b2f302771196b00e5663169e524f35e9396"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b88c9b382f037e52f0c6842322b6c07bdd9923dbd74aaeee551fb8f338818610"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a46da4414ed97b5728e78f0fc62b5a6a3685804975143dee0dbe19aeaf9085b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4fc7092c502e1888ab3ebd6c43629b0a881f0ab6617fe52d1056fd8f75ba9998"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c340ff289c81ef149fb126486e8db0d9ad27a100bea3ea1a6ed842de3fded316"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e9c11b029e5d99755c6777c085751087da8ebd4ba16bccfe298598bff9c32c14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "130435cd9915c686dbe252ce41c4f02b71b6baf20ec3271774ed69ee85ed9b29"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e0e0a1f3099aa1997dead4181aa74d4884d5db6e0e856175760162879565da30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f29ba2be159d1201f065160f017c71d64bd26c39b02b144c80a96c5c4136b439"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "dcb1782961d4c02ead14644029f665f11c3f73e1e70f186a17c194f407e812f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 
512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "751e253943c85ce9b10b4df98927147ae30ab944f756e91a4dd5a7f02e4e358e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "707392bfe802fe36f7a8bde4f46721e0ef733ffb5964dc78b7d336f5dbf5904c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4f367ff048b03ca43055ed88421ca3d9fe193b601719b9147fa456e803f8b789"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "30d6aaeaccc029c5843144632cd61a0f48821b4397a895d1c951595540090967"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "627d90630f2ddf978e117a40dce0996a379235e832a345675cfc2f745b3698a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "02a7300637e5f923c5b3b5846bfdf03a5b27f63ee1fdf83e33a3701acea646a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5b24730d8b6e106e68e11265019e1e231c4e9075dbb158c180daa931d338fc96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "38a80556eae4e2fd25dad3696aadb4ee80c85642de3587e494bc3b2b5b43a51c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4b5c46edf013965731e5157c3811fa0d595c8109d4e5d49f47ff4e8e46a3a5c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f8710aacc0424cb748ea4c2f49e5373c8cb6ae0c6f652d5a05ff1533090be757"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "8c9d134245deb3189ff73b03232d6883dceb094db3cf6a4bcd7a3005d2224c23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"26686a89bccd749b00e6923341b29d73a9e0f9dc7ca484e3495c0a64453ae67b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "bb6ee5952722a6692e8a08bf8322c74ed4724a261185f797b2e9d5c61c00f403"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "477ac564604a8f836e12fa39c2a985abb7d8d9bd52ba384e9423771ac60c3963"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ae1fe3d39fbb9ecb21041f147809200caab9b79be5ded21e775fd066a06b4a8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4b475d1817d9bc99065ca8fa8f6c7ef652153607401f0ef58edb0cb4cad44b68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "bdb0c09404b2e5dfa9dacf060cc9607f522153c1e0beb7da060b3372d2530ed5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "047682e06c591e3757b46da355c87352e5a43994acbb8fe52110bc0f397b5df5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d629a4991fbeae28f2b4d9db8a8fcf0ec08e0a4047e331ac27970db2fcb87347"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "903fd411f56802b8ecfff5b2d6497162a97914decb1d09bf52c24162b3956342"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "618a21c61105d84b91b93cb62a9362db4050be3d1aca0b7cf16d23369018ed9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8838d1f0c14826f593046d0f4e078c668215acb2cb928bd3382b303884e3aefb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"58e1e9036493ab85ef32dd16a45f0168ad1a7c4ebee64ef34a0ee583ef0f3c77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "332b461a4f1b6d8807a5c0e3a01966519d37535e94928ba93fd3beeac8f18b4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c37bae247e326a0c5ea8b614cfd17eed2d5e4d9872ac05982b3997e174dd33d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "146d90cf5d8964c19d0d67cb34466d7e4c340c9ad1a1c46f60a0d0ac9307f604"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4808a5c8f660379fd7513c9fa7bf8354b8a646b32da580597cdd3c5cc7cae76c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "829f80b04a413c62e38078b58a787fcbcc39eeb5171f751f842152badbd4ecb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "437006b8d5f716843cc3b3b68183a72f3925f66b2735eb71613d0bcc62c05b46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "89ea15389d4a54a125b4fdf32ae280d636675924ac4f5973f1ba6482b43892a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f796a99d12a0d9beb643c0a30aedb98efdf0bbbe0bc58eb2426da830d9729ce9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8dec6f5ff7de93c70508889d3fcc28f01efea44dbebfe00615a4588fe9add845"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7abba4dcadfdb75709e9dedaac3041ebcfccf38edd22aea10cec6148f4def447"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 
1, 0, false, false, false, false, false, "d6b434e85fed7cd4b98ba712de1a218972de1e4bbce49fe0993bb1cfbbbc08c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ad0c49a5eb97056b20047487e7ff216866977395e57a69a2d431844d6280f06a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "501f95bfb02cef240abbe1eb5afcc6b21ac1b9c24e03f25080d898c7772add59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "37500b54d656af20479a8ab139b01b6f0dd7305b3249364451017d2dc8880ad6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f29f20cb056eff4fbf49829fc8374867e636809f59cfa2bf6bf3799d8abeb42d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b6aa75fbd19c9740aeb3d6e6100862c5bd6578d5b719ce7c14d553a7176dc672"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f671b22d3b2ad25cf5a7f7e2fa181a1aab75a1fe32f1b53d96c6824a5b152847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ae227e5efae6230626ca824a0e4388eafb1c9dd30d6299279b97d28a5cc9206b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "87ae0711ff69ece781d5594acae7dd316d5249a221f83ff1308cae3c417af4a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "05a61b7a5fb4a9b022e81aaa5733f97d350c85743f82ec46a63a5a758d7b1a6b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "561b7bb1ad405b3cf6d711e27fc8b564da57c5edc69734de23389ca750199267"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"963c85d0da8183df8fc97944391eedbd1879b9a3b165e654e848ea755523ca99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fc4bd1104a5f0f1c26d7cc515ac496161b15ea99834d88591608ab77cee2bb66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3d32a221858d29da44d2dcdb9b7e8ccdcef96a84e1c68cc1c5714007b9102f3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "0360b11421fbed7030cba0d0b454a6253fc2472ce11e4b37afbc672f53f58be1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b259a6736f2d59a09e093d0b19144d2aa20e74f6f246cf36d60628fe52a5f448"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "37ae8a29c762ef66de387adc67acf244ba36d63e3e1d12d5b96744dc7bfd1182"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5d944f68279e906865b100518e7f3f4e8dea987e97047ce22c42692abdc5d08d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"53144259c0ce10ce75e2ece8169d2eedb676079fcfc4959c7384800093ed413a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "baaed66d5d6cda74b0827ff5bf5c4e71d0acd3cde798b33f3ddbe87ebd2b42a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "cad47dac6a21a8160cd0fb0200cdad9abb10c9bdc5d5795a0f36a77908f416d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b808781995b4e4ea483f5d95d3b138a0be20ca9a9479871ce2f7b36443792b9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f53b2c4d37c6f2f482329a7fb9b0cddf43c358db8e66875e87cb1fb9512b16d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b013c3ff659892e5fb6ed3d5f807499dc26887595b4ed138aec09ff65e051a5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "99bdc197a4899dc65a08a0ed1a53bd5f09b5541fbc07d1b9bd5fcd1377a412ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "7c2b7c49dfa8e0ad59085a7476b68e45105467611071902c8aa53da2bc317997"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "395ff2a1502f25c0b4bf29bf98c22ab99c11dda42dd2d4c46164e51d3d016845"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f3e44c0ee625a6b3350836ef43c3d1746434ed0ee8d979d23cb0215e3b6de067"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "bd0b59149e9532a7c9bb525ae37d8728056a6753fb6bff25fca14cd35065ed42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "efc1cd627f7b015132c5e4acc0463f4355888afaa0d53b5dbc76b1525825a972"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7a65d4c9edfb8ba1f0931b8dfe108f4fb8d298a491c9f76751d264d093fc60db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a4f7e6cacfeb9077148ef8f0b79bac02140d393b2d3d8f047225ad7bea1773cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "35c71c9fd8e13038ed16523707317cc9b62f7cdeffa64101eff218c167690cc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "6459ed8b8161bea595d386d55bd227a1c7d473fb00db1738e7c14e1c1f7c6435"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a60b7a5f613772bda5b87e44432b8c31d7e9821233fb3bd3e392dfef8fa0a226"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "446a0c6e79ce765ec5a8bf1537521cb74e6a423a79d365a1bac8ee0589072a70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c12d6355c968b2e72355522b67ad6cb818937e7a5f2efe4f7b21389d04f9d7d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7c18d5fe49f6b0edab8b1170394b74d30017e95e64db730327ea601b1f8cbc23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9b7a07c74b1b0e7a683622bb19c337894a216de5414befbb30f5bf7e48aaa381"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "10314bc01f562bfbd0123ea6d2f40b5bbb238c1ce257cf9ed14bb6ad03dad241"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, 
false, false, false, false, "535ffa8a58021b488ad4f8a3f2def76d971b3f61655ab5bcb493b29f25c62b0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9c630f96b1e94edd8ceabd26c58f49275e4fb5414eeb3c0d651be8abc6efb347"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0d1b8cf57d88eeee23dd95a8e2afaf9ad67bc93af31ce9703a28622d69d158c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f25b4c977d6dc7d0220f57bc15ca8112811715a8638f93b8b5d2dad9674b82f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a6494c8131cc00f6acf87fe59c78266a3c1ab07323829fd821c3f936701a177e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "beb888b377afeefdf49b650d882628d6c34c9a29c763e295e9c93167cf57ce61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f1ed7c827473793bc64d80cd1bc95d3f0c2103cd07d5e79703c5459d8362e9e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9314d50aa62cea7617cf877a97e095e28186523c64bc0cece4e86d594010ded2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "56f22963b5d9dc695b1ed3fd603c7bdeae94130094a482e73df3b848dd27ced5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "8a08f4a98a67f08f9ea159a47c3b8e6471815169553484ee3e50b79c133caf25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "588665dfa9d196e572076b5f7638760f43568d376ddd89814638c821ffad68d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "760cd0d20045a10d6f645de721977f9b806be3e07baf1f5957fad961feaf6971"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7656668fb14b7dc662ab9a06820e64f6091385bffce95b55143747a7d85d8776"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "74fa8831aac468b9d7b9130891d270ddce8362f4683f69805548d4d05ccf3124"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a9711f6175d143d3232010e9a34899d2092054479c7226fd8568e5fb1c4580f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f4dc956e2ddcceb01726f64117f03b5cbb78304b0ac609c7505f3d2fad6b0a53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "7e1b22d82f1d27484a9ca8ece80a81431d1287d75a16b6e83f5fd97b224740b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "60c0dbb0ca5b158c9039bb8b3a0f4e1c4e82a8f5f4e96cfa53519fd11d1f3514"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "65a9429c6a396c4e0f13b5c97d282499cb249a06854096e4b38f4768973fe1a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3331c6b9b44ce26baa7121bf04f12606ab3508c3bb5e2721d422b74dc8aa2a5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6e500858dfbbc7f6561c62828d84a966c6aa7b8373936f1e3b786600e6386f2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "85ae99d5a5db0467ee6a10ed834208e5ee9c1a5f9f175b7c77c9a36007574f33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e57b1513b4ff073318e85c576fb4899fcdd74ab0a5eaa680c3834592cb0ed8e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "0a15f941907fb558d2998556b3f12831cc64ed4c2260ee9f12603c902fdbc8a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "add1cbeaf5fad843c9a09aa7f67708a9b3d3886565e6be98d5ed0c082d0381bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7f3a764ee8bd8bafb2a8a133b22f1dfa5bffe2568f81a9afb0e8a04746efb4dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4d5d329fd7e30f666e44823c8d49ea18c7f4b59765399bcc780b1ad12ab93495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "212dbf17273d8702a7d60ee6bc5c60c2b3c3208ec43611fae77858f2c8f51e56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d7be62d23c9e16ce13075e03e1a0e91a902a9e68c16d7dbea8847817e2b23fed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0f3fb4bfe8f8f3090ce9fc998f225f1be2d3f22690159ca2a74a027340a16f9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f5250f811a37bdc02ff4950ed74a0f7f9a6f8ce521996a1e02051c6a2bec1561"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "234988b52d20f741742b54b5e9ab1395934de0f7bc8f984ae30fc047ee4d47c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0deb47c50fba4935bd52be88579953b5af08c087fb4169108dc95ea57d6affdd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ef6a1bf285b06299cdb98b184fb97f68f2c78ac60d06a190803da8c3ceed738b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "adb1aa74211ef7984bee82c977f940a2d86298471034769b58d74e6e547b0dff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "16d7fc347dea14ed48a5e25569a03eb9289581b26e35fc6fa585ff5c2c54e371"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c80f5b8d26c270e7f806302fc467bac4aaa9272ea1a5147230beea146b259f09"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e7feff3e377987a9196c7041605243264fd15eaca4af49ca6473df4760e6297d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "57702478d2499ad8174d362f5c17f3bb020b7a6521fe4efadc0d1c5fc0f9edf7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ffaa3dba722ca057ef30576c0dd59ccaa630bdf92f69a8e629b8a15bc5ccd581"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"b9008ebf5e3332c40aa3e903f7a8f2f45e0ff3f9f9f9a7f7656530fcb921e373"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "07950199fdc8fa37921e24bd4ffeea8a92e53972b141d9a73355ad71c99eb999"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9e464d76d5f83b6555f49a941ce65ebf77439516b09826ed7c93e955dff7d3f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e69e708d2fff19612a7c17c147957a8d13b813519b4f7647197e46d11d7bc04b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e66c3c35b055c94a06bf4cfbd8a8c3dadaf6e280e1b2fcf37a3a9048fd6a2d8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4b2bbd53872baac38be28723fd37438f513eb5bf626e59219dc78b6be2c86b28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a56056e645e06c48836615af43d0149c47a902ff229c7235b5056c38ce7fea72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d36fc0c93a52d5db5c2628459c71ef023a60f5d6f0a8700f7d4bcf70064acc26"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "fe0b2d08a370e2c79ae55695c566ae6609e3f18f0f68f09392801de71688d4bf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "135b54db5e85a28505c71a548dd70570bea0991e7a31b975e5e86a79104804a6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6ae0b8f86e9c35b3d3fe0f84d73181218fc6e65d2da202e48bf600394f8e0103"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"eb0262cc12b96708213514bd43b2fd801ed91500053676f98a81eddb8b4ab94d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "b8a254ec47551354c77dee9474e414b9a9ed0ecf519f40cbd874b5b150f727c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "28ccfec4786746133934cb7267683ccd656cf1e0f103c31030aeee7c13d8eeea"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "7601c6d5df5da2142979777f1e0e7fb513af6ae4f3d8bdec91c011712b9e5989"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "03c9387cbad4d895e03fefed7750310e198defa92f16b5dc0e81ba42693aa052"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a0b869a7eae80e83cd690d6103c4ad16f77f4ee6ce9edef36e3c1694a423be83"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "b45d0b4b06f888e8b5284c216dd537d00de0ea80cffc17245712223805e7845e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cdfe0986a5366a27bc6ac2a65443e309d95c6a09d10e42863550982d6463552a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c742de75c56b8d6756e845705ade8766b49358acc744baf53d94ac8efa389f5b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6da26ddfa663aed7f5080fe05fdf26f2c59af161f38236915b6cc7c78088c396"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "843a85ca18f743ab95b370957a485de47bfc8e078c8434b1d2d8d2fe889e4de7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, 
false, "5708eca6e4ee1f45d40ab8c260bef0a54daabc76ef357ceefda88eaa7e7fc7e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "811f2a8be97005744ada42ed890e4cb82438719ec5c05d075bb8f70856d7ffb5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "21df970320d7ba23b0356ce015303a25201ee0ac5123b5030b5b6fee908ebe9a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8c7dc685db8210f0a8d838c7da9017a9b1d91280eaf53db322ce815d323a8282"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6ef97575483733820d007d8a2613a19fe2a4ea18aea50f9d85c7adda8ae49ed2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4214b3e015d072c6e8aa854f28c94b5a5b3a7c5679f1648a6f1a8bdb6e5dc7ac"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ea2a63d890562caccaa1d856ab8442d313b54dc98accae56b4d916956a9c7d45"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "e4c5dd2951447d276422aa97611b28f2d3e989a8bc7b1490ad70ccadd3f468de"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8a6ad545b52c369a46c59bce8053a64af710dca9471f221aa37d239e1f1a11cd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "2ba582b7171e9eb7f95d442493b383cba13cef7209a8c035f6cb88c85cef00eb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8c6e3e48d25029406c7514744b2187f7e009bde75d2ba2af27f672751e9594b5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 
0, false, false, false, false, false, "974b243937187d37bdfa4d5cfe32200509ad3d098e63b591d30f46052d53bbbb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e2f7693fc8c6fbb1bca5ba732c1822b5c07336585a844b1a1e3818fa72435f85"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f41631620790ecea72effb0256fffffb43a510ce74d628776537ea4ca40b31f8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a2eb8e55046f074d8a35b2f6f7b49e6da918b27d90e160295943f4997ea227ac"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c5f2ea2bfebb15e53504c59415dd5e107275804fc1cb5c30b39c585d1a954fee"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6967a474f4d61c5b249193eabd62aa1b4adb6a78ef5458b8887ddc27a1715afa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cd8f275ee7edefeb11f1219389eae1f3232895c95bf3ad19a67cc353224242bf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6e11bb9536a87d694578a5e65c22a477bf96cda204abc54bdb18ae5789ac5d23"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9dee272b4735baafb7e2c8b5d8a5c6a75aa2a4a53544a1a727d9f32456ad3210"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "de6c6410a84b97129e5e3f59b7c927b53d3c5dbcda8549bf1855369d24ee0eea"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "bd39c83b7274c81c7f40a285215c510ccf8d73e9c02268aaa4c8b39f7f9d2b4e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9eb99e36145254b8acfa2c19cf5d4ec2f469de5f3fc449131d0ac2f7baa4536a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "feb83d5ade07ab2afa6a74bc7a7839c457bb247ce60304d5bf2d7481609c558c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c7258acca38063e59ba46b3126119713ba72e71cd70ead1961507fb23cf601da"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2497b700426a9e723aaf3701860ab0c03f072446639bed76aa2f093a3ba351a7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "83eeb3ed81f0461c5fb2e81192e32218d27a1b5f12724b5affbae4e9aefa7e23"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c1d94d2dd9e035b3b1c6d4e270d886f0a5f8cb90b0718e40a352f340b9731b2a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5053b0005130a3015e425ed3a778f978a5b9611189c6075927210b9d782fb46d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "375a48847ae7f23cdf36fc89e101b0b6cd44fbe7390d92e6914f0e6fb6dcca45"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5c57df3828e9906cda7d0cd76bc9dc18f0ec7a0e309228eef2c0b17b10df51bb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ff2a6fc101decd39fcf23310f1c30720d5f9fdf550f07da87099d6b0600dc8c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f5779871fde0708e84ba5f32501f906709cb94cb4bd58d459100edcdc0984947"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"700cf5a5c6c31fe83f3d1276f1a004339d15bb613d2bd0cb7a4ec5b62295e0a9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "84d5a81c6076e279c1f65a8b8f6f45cf724667205af0442b54ff57bdeb560a63"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "75ad050513c7e5104ee5f636411e0181b99f2155a56ddeb4c9a356820b401304"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2d6ed48a920145858073c102879cbee349fae250da2752b81108ee0ca7e3ebde"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "2fa50d8d90ed1899fd7641c82121ee9d7b730100de2f8f8ca11f6d092dd28ec1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b149aef4ff023efb25dab92b03fe388a8c6c573069af790508420ad91d4ebc10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3b4e0524789776bb8206e75d3329fba313da5bdddda60a9e8e34f026b7b126f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9a176820807b10f588a749383d1c44f23bb3dc25df12e4a923caeab4c9e6bbcb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a68fe4d3f52972233e7d25ba1cf4b19d88ca536bda57c2a2a3881020e521fa3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c7025ca4368ca2c5877558bf92c345e554722d7dc6b63cc2a18847ba500bd6a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ac9dac8402da2d235aee7b15461fa455658b1bae4294e07ac1f07391ba90a72e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "104bdec218bc21eed45e189a887643afacb58ab75c7bdef306beb3530d328425"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ba748a2127a77a0347795486f6c1c1afa7c8510881355a984e27d143242801bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "efad8de250772cdf2d714404888152ed5c7a14f7012afea154f5bf289520bf4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dd4ec9a714eedb3ffb0955d0a8a299d99528a506b173fc23e88779051a6d26ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 
32, 0, 0, 1, 1, 0, false, false, false, false, false, "cc5b83f0b1e3ee121dc9f7c2dcdfb17b55ae37fd27462dca98effdd971f6fdc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "daf9f9e5be485d97ae8aff12d858b1908d91a2122816df8840cb02a9389c2519"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65ccefad62f3d50738774e6ae297c6fae486bf861db9eb805857af56b080747d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "899dd5f6f56c7fa48e52d18c6314232052190307efde0659264ea8f3bd26ec3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "75ff058e28e391081af0e64633aae0a845dbf2195f900b8c3ae60cb083891533"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f4cdd3cd8d536ae5ea8b2719e25c55c75cffc3f003949ca4259f301e6e8ba196"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9db61c77449b6d4f7eec6532b436260222e8c74cf51d1a99fbb6311dde6615d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e5a08a95565cc562bf35d3e0fce7c237262bf75ab19c70bcb01111a2a3606dcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cf04c039f22f5fc9b51e10c741ea8b6c55514b32b4ae3aa46c077be19bf438a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "37ba73fe08bf06260d622b70f6fb76459318e475e11eb04e48a7930662edd0f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f76548d39932f5dd1dbf6227931db8febea9864a8ca0a5cb33c5bd9a6bd15e40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "09a630e67459114e94b31b24e3f74cb238e102771b842486fbd6c015db21b885"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "89aa4e32c12e9a400379b1321f984e480d56987fb4ee1c0b7834f45bc67ba522"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0b7f1f80c34ecee58370a97022d327315db7f8dc25c61e78de9dc514bdbdb2b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a0a04ce61b26f2d4bbebd21473089f2509772c212c9f2454754c1130c224c465"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, 
"176d5d860be9cae393482dd75170bfe9d146fab044b2dff583ee0391b03bcc42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "56933b78d89af2b5fc937a729e8803c3b66e5bb71fac757a82df23dcd56fe993"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9695062f74aafb70cd2f6f651cc6912d3c111232785c57bdf24b52b466c40e54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "dbee27129094dfc385000640e5792c9ac4ba94b4a86c6a32d6cb00b2a3789bc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "86358ae483adf6545077aff4bf2e9a547df94cb31e117834af217af47b1f3fe5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ba58721fff014274d3cca41454593271fb7ceb18d728297be3b26258caa592df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "d3e1a69415eb65ed13f10a56548c56039c8b5fb4700b3c216ee45600db92cc6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "59e61278596daa47fc62e428d284235e91006e3e6f33141ff2fec3b096a4866f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "4d21b42c58a098bd82ed43b89213e95231494ae157c4201cc369e5b9f105f14c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "eefd8a2b814aeb23d6375c913980eeb11618b87267479e166fba8ac180df76dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "088d43d76773e80025ae7b74be526cc6c75db5352d1f0eab70450ee0ded62059"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 
128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5b5500588afb1ae0b7701fa0ade1a45582d01c22da683c7573c0f5e15ffd11c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3ab064eca6ab6410a4c5a50db61b5fa84cdeff359fac4b64f6b84c3ab2ffd695"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a15573bdc8e7def60fde2ec9851acd7b1c47aa715d378a1b6999364ae2edad64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2dd434eec9fa0d5729dd9bcff69908a27e4fcda92f90cc87e39ecf4993caadda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f1beed56222ae46a9401d5f2fd19af1acb4206571034380d0de45de600ac49f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0e97990087b7b71e0d2fa9a6bd33a5ffdfb89166097cc9800799f0aaebc78c62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"f1b27f666d475a4ce2a147f463e5590244cfafabfcf91d3e1e256ea4bef95e00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7468ff1a4f007742ec0c55d80a0f6ff43cbecfa9d9ec7b1a8a661defb9a4ecd7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6dd6f2fd0facc54aa0f09405fd2822e08c277a6a122c5c3ccce96085b621f6e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "27730c25edb8153b11f3d1483eb05b0e0ffdddd337c17225e9df0273490003fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f4bb2a4c5aa62fd750c35d02fc615151323852c96e7db6a5b6f4e946dc9c12c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "afe89c897631fa79c40d8ab41c3e2dfb6b74c8c23fe40f1533a60ee0fb679eb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "058e709a9984f5d288d5f036b4e17306e4f25a9f2cdc19455f3357040d57af0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eae703d47ed77d0c618064acd9e894603950302b0cbfbbe58546745d5d3a8a69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9ae2e3d1f7eecd49644300253d81b1182be7aa6cb8642eda67a412ec679b7e4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5aac2410cf8216e50066d86d26448b7486a213a3b393ee2529cdd0a23648be3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ca752948487fd3173e12b74318515ff24c2c29dab9911caa9571cffe4c8551cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, 
false, false, false, "9de13b6f3e674e181a1d6fa83b3379e5d53233c87a8b69260c0435412b59d3ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "73dc4b25103728d05869b4cfbcae3ad27470cc3c6e529e6b1870b94d64896f75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "863c72a6f1d28afcc0195a0d73354da7e07af5da52ba37aca816b146f670fa91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fcfa117289d78d87e479d74f400c933e85f9b6a3731a98c208ecc689fc8d38ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 
180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a5ce0ad9e94fd6b1ef7110f2c98fe6f0165410ed6012d506567f132882a45f51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "07d6ae141516b7d3c9ee1a52d7cd9b2c0fc3fefb1f0ba8a101dcf457c14bf68b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b422236efca62d3480acf30fa6cb08a95601405299111a09467c1a92ef11f0c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6392d187a435e3f4de9293ac3825ef580e925ae9a5e3891f72f7d4a523a344d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "891d89b7d3e1bcf595422342236c1509102e16fc50ce4d1dec3f3a7296fedd52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3c2e4435a35b45893239b522f1590dcfec6551d9e724a528e084cbbe84045052"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "abb4dbe18240b09d2b51c35331c2a9d7922c040f310b9e04841e02ae08bf9b1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "75ef0359bd85686a0d30a884eca260962f2dd5759610207d0924f8ad739d2ec1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 
128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b3c77953d8a748a0475aee6c788a58400ae6b44dca57df3f56cfa55f682ce074"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "669bdc46670c721ce56be1f17fddbcd7b9a596b303467cbec628f3309e623a57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "732e0739cd78890ee4a9892bfc5d78e20fcdb8a219ba8d4d8cfe14c1d3a6cebf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f94e77c36416243876c49d8cd8900bfff7b62091047fdf18dd37c33e1d5eb722"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f303ad74edbda689701f0b68c9d6a31f4213dc8a6c28105584d62cf727e7b755"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "8b20864dc92a5fb4bb448c91e95c7a8ae65ec83afc5a62e56e2ad3bb9cfe73cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "706d6f5c5a101340b007f8dfe4f3056171f53644a825c2d7d1b03ca60d37c5c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "17f18a2da5fcd46f2b4f41aa7c138207dc6920b6f49fbefe5f8934ecac4ab42a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8b920d35b9de005aae71e786c5d0f8c2e1ff3a4cf0909657397e3b7f02f9ce44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b552d88d491de085a8468d6724b425ea5ad22ae3e567a9724b6e7451cf7d038b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, 
false, false, "e0ee2608692b7ce27a3a5a2c60fa0e45f0424e01d80895df144e788c5a73c130"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "84feaf256a5f34b34852e3da1686eea074bef3314d861e31b42d94e806f45c91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "51e61d2d8bef61988b1242d79becca1cc8cf86d9020dacc9f1d86736fa8fbdef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "e46903333a5fb00aa0aac43bcd2bb8b08432de27167d70ee5b739622bfd5afca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "bddd3aa9389f091dc709a77bc887cba95435ba18abf5e74b6c0f57a9c91cc781"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b6acefdeeab699b3f6f2eb0d0da419c5d44353251bf0083648d547369bacc831"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "33fdce28359c88adfd47829bbd1ace7e4d2b42da776518f3990c747b47b19b19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "d5da56e936d6d97fcd7d5e028f441e86bd5937d249a1b7764865cbd9d2c55732"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "74880ad89dc908c813fda616a9b992e2820881515e4922917c681cebc7dafdec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "78d269a80ec7e57a3a439b03f5e8096ef56e3e80ae5ea89dc483c1536f4e5b91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "25b99b3ecba8a22be5bc4d19303d5a43d7d528f065a5cba738823f3677450bd3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51a9447b33f700b3fdcfea211f2e98eb05389be809d1de0ab79984bac47668a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d3c9c96ad45d607343093c382df2d967cba232d87c432b9d5671627a30f05386"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ae8492a52953a1b671b54a366a9f6e8abfb2b6a43e55c01233114c43c2e975bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "c68c9158c88f0285e1183770f519514df7b3d14710bc59e4f94e68c39cb65f5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "220a7a7960616e00e07c57384faae712c2c73b13d611b9cda562ea8df0c60600"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "004b32f0077bc3b728ff75a45f9c9c99861751868b70e742f55e2a4c114d26af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e334e8bffa1e4b53d8426a1ed779e9fe02786c689a1ec480460fb15b9dbfa0cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ae19d2d2ac022782dc92fac3f42c79e480633422abad50b243cd28e32642b03a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4ca6a1c9cb86de7e69504f8dd0ad52de40f46f6800ecc701df403598386273da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "edfe01cdc13a53eda44df09da4260243accb2eff254324bbc6d95f945a699c9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6491108be3eaa9d3a305454a852079e7cebd5d4a9dd47c8d5034a71b15bff59c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 
8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "15053c497b53cdd92add998ebe5e467e95bd57202cb941369b8810a0fd3d5d71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d902057b573e72c81468ad3e21ab7bc8c06fad6b993290e0f6806ad7f29ee792"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "89479aa295fb50821d14b07ac5e6d39badf6af5952ed8b9e75886c0d23e60269"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "915571831c4a3baa43d58da0c872c6a5cf63a7076ece7f3b635e92b1c7128f77"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ee4086173bb5fb2b4827793e61c8a75a7b7470a149abb8f4561d2db7ff67cc2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6e81250777e27749c126497b900d238adfb9d77b6de67ac304933ff58ff4438c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7f8f6f473ed330959a1bf992f7c6663433bbbed2c1dcad97d061eda933cbb553"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"01e858a32b62c644fa1ec22739fdf1388ba21e9a7524355ef74b43ea14022404"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd398c620a37556d84e69b52c17ba9a4feee2fbbe3c97d31d28ff1225b47beba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ff194082a2b62f60f7ea6b1152a9cb3dd6dfd8a33983d1f88d965847f1638b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "52a6710a7b5e7bb005299e77e8c3e740999bbd853e1338a29eeb89076fa8c0c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "5aaf6b37b523bb1e8c50c42bf51879ec1756c1129cd0b780d68fa6d362ab3c51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "71a4f3d040218f98334725380d3724d3022b14451febea2af9befd6577fa0fc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5963136e972a354b423f4b5f87bce97a167e45ecbb90d29b5a9086299e13fa93"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d7bdbfb47e2a6ea950dd36d073964f69e9347ac6b44d44113af8e8c24df51341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "d64d27d8c4e1eeec885fc04871b263c052ff283187ec7cea37ff3fa19d5382bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ba999ce4f40bb7dd830a184bca7951dfa8bff71ef8e2fb9a4efdfe178a9dc8d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "3c5c8b5bd002495102e0ebc01d67e6b6efb8d4c345f464b8a9e3d6d0f6e7079a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, 
"b2916eba42f5cf8a1504503b93f7951415a6d1f06d935f4abbb8ef82cb9317d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3be5829657bd310d44197385ab64341dd0fda8d85901e061447c309a95ca02e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "a9c6597ef687ca0dd63c6a0d3b187beff05e35f6d3c1825040d7675eb7dc1d16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "362f02e7698d48b396c92659055af196108dee06fc2bd2e7bbb44173af3c53b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6dcdbcaa9f08bb6ba5a158c4580d44b7b059701269e0fa0d1513f880b0bc40e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "34cf51cf50bda51d4c3620578824ed250fec11159e1124e0ef419ab452067c56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1e3febea4bc2cd225fa762a1c20b7b58f1a9cc4104924490227c3c6f96c4182c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"a17f0d60d9a2bae580742e3bdc2100825e3e3775ee97bb02c9ed155fdc4c3cce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f500a5dba46d45a27502eebf4d7c074ed03ad213b15197d6d730f926a730b62c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d1be4373212179556613b06d5da19516f858a72e89865d232d3f58fa2b72b038"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fcbd6e0db465b8210c69746b24970733490738f73146c574572013b602beffd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b5f134c6a7498e79ba8f825bbf7e4e5d05605ac264c87f0562a7bf2e3079520f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7e821651894ffc8a196bd16514ea1accbffdc74c58f04c99f8741279adb7d691"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a49f8eb61a238ed1d4b24bfa5a438e365ccf22b5b5c115e65e1b9cc2e5099fcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fb263b43dcf76d212423f4fe3291b68ef434a6c64d77888679254887386ad99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "add7f579c80e58a158b1c31e6ee7683fba9cc10f218b92d1f84dbb51c5e89102"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d1ff0e1fcf39b3f5c1b16e45ec330c7e632e22fb798aea6e3fec7bb1b2c9802d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4a038dc6820022d273d4d45781de7b1a6eaeb2fbf7c207c2714fa857dd87b137"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 
64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2fbd4767059a0fe411b2f9fa3b344ea9e03c9020baaebfeec7d13f7e7ed2f4bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0d24488ff215eb5a38cc37495120be92ebe05a3a6878b24003559a8d2d7ac866"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "225b7dc35c64ccc33d06180243fc945056d116b6c8134c71479cae625f20c44c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"3610137ea0fc810d89c5bd9a1c31807ed4f2429ee5ecbec2597e9ffe86b19f2b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "304e87adf8ceb391ee1941ab129d96b9cf7c81990faee7c8010f7298f8438bcd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "98bf26c1913472257b02dea317c02bda92ac1e5eeb1bde1783fac95f61ed29cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "561aed2537e7eaa7a4685a821cff4390f841b30a2a9ca78377daf18e5f24d06d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c1e1bc69c47c3c80db81b39547ca6502a9bb9422073c1d260b3177b64bbeddba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d0b1ec60d316307cd0c47f96e931ee83233592200a4fcdc4243e6d70a913f598"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "66995087c70642b64b813a1b34773eb69dd9d67e214c27136d24e56362feaffa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d6329538d8fec5b01313090e1ae84ac3ae3ddbc96c0d62823c5ec2a1abc6861"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aea86c6548409f1a47f335d3aec8488390a12f7705fa62a21585469076de0b49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b325448482814d86e32e6bb82ea17a816552cdad61e1223e4efe32f34cfb484b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d10b8333b75d3ca64b1308acea71d90ceb6f8f84a169841d03897cc42a6add2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0ef97810ed94b7ff6b0722160d0f9c6980f7a482407726608d48123d734e30c8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bb06974794b11482c54fa4730ccbf601fed79a0eba283265a05cf565c45110ab"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e64d1bb568e2dc9fcb7c26b8de6f8cd4893d2730d9d34b0c6a9cdf7faa9fee0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5745282ece8f2a11f6ff3947557f5e8a25802861285b7c695f1ff5602cf679bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f061d7ff7c70186556c98c083424b6e701e210285caa1f8a0a97d32b11d57ae4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "39b3703d50571308c760adbfe08c7d7bd4dfffedd2a684b61f446146f1b763fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2d5b67610c0956e5891007397094db6acc754885402d8e7e3c620999b7779d3c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2576fe18a70c1799b082a945ca406d5e34a04edbc82dbc8ad38e6a0c6246fec1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "30b189778153228cc3080c8f5250f192b9950b8a83c171cc17899a4c926ef030"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "360a2ee98ca1e7a2450fd3a697ebf40404eb1cad89d627ac22fbbdbd5f4ed032"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "851293d99175ebfea8e36cbdb70db56542eb979315896416c56920bd672372f2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "174ea0a369ddad5bc63c79507b146689009d0e5001b3bba9e2b082a6c953cb15"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f231c5102782e6f55c3b10d6fddad400ef74213788b6ecfaca1b6f10a0930262"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4c3731a229c255180e2383b0a2f23bc888080795e81cb62fbf82b208381e5240"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "58df070c76bbaa6ff2323967339e9374979fbd918f5fa479ca664f1ace49e547"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"70cdb3642c3a1b4022a5a3cbc90e521205e8570c0d2544d478db5be26561e638"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "40cfdda4f209761ef3eca907fb84f09130214cab4b6d60ee69ea1672795bfc81"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "edd9cbf6540157c4a2b9d15161d79b383a077832f307f3afed73819cf665a481"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "41f9ed182f80238688349226b59a19f90106ff4b17910cdedd60f37d4e2720ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "285fd78c597d80fa9c353a00c3cf37ce07a038fad316b91e196905ef4ac5510e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "1b42f5dc3d6e8f39b15eb98d26429a84ebf92c329714e453ec94c5c23a951cb2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cdcd8368539d349e56a0da3f917d3aea8f1bad85442eca9e71cac099931a5cf4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3a745ec8ab72d6e90b807cf52d631db44d7b4fbd1490b04ecbdc5ddc8d09f36a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f5d7b6911391be1aca7403b8f0dccb8e0b0b42bcdb1f703ea5e6cde9d05dd568"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cedf712829ed7eedb631e6616b51b20b6c40f7f71bce5aee9ddf4e959ff6d3f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "988e2e64cd365677f93c1b9623883c79cb1045ec02aaef2d20635f36e0195ea9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "128f37705668460ba6579af4ce7bd3864bf474f0cccf9b6444406478f1bd26a9"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "62f9e7bd5ff48d0d4b77ba80d4b1b722b60762b0ec778de638c9b1a2754224f3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "14d3e3c8c95131bf255fc36e7c6e45773dbab760e485ec731477c0f2dc4a7d02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "934c2d4fb468df623d74b4632b2a9c939a6f95f849f3c8a1beaf35e669a25e85"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 
1, 0, 1, 0, 0, false, false, false, false, false, "a22a2df86dc7210614af22e02abea4e8c7eff6916545a5ff7c5d8da7ac443c36"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d3548d893121af62638608ea3a0b6ce0dc66243292ffd6324477b6ea9a8c7c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "edde20adba8bf537b44729dedebcc263e4d8db0f9ca0754a5ea3971f73e13f69"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a3a49eafac4fde1df3976a08d56a347361c190ad316d52d53b7e115cc985171f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8fcce917d59b60c70af6ef18e57d57e238b15a28b4bddb8a63ab683f86e3378e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "523fc0b3b8827a56176e7894ba229bdc7c824ad54c0465568042f710f6abcc82"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "88f0843d0c1308fcbd0729b321b0a84aa6cc75f19f7b1788371ebb4d268fc3b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c3c5bb3f3002b8c12a154ebf967c05e8c58cb0deb2f85d83b68a4e2bd5b84982"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "691fc35d4b75894c36e33d3236cfef861a454db736a68756588c31886621dea4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b346e12d9b29ce92015c55eec7eedc3943adc616e196ebc8c65684ab8ba211d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ba50b9aea73f4b0a9a1fff264affad591a1307e07d3794e8f018b0f7d8c74993"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 
1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "84a6cb586301499f6bea219ed094e0c87df0217757e940c963a3a4208dedbd61"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "97b70df2d05f5063c18437f97937d44421606dc2bc605981e0265e1c9f4ce81b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fd0462926b6a21909d90ae77aad39edfcf53b1afc777bafe0c56887cd9d73f19"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d86e163089210d1517adac7970151574a8c7ec050f55ac8eeb1d01b4091d983b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "af05a175e01e48825c7a4e0e53b38388feb464440e950e0a0e4794538214f631"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "28d3f9a2353563b88f9ba35bbe4a53b22ef53e614da53b426004d06ee64e2332"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4844c027bc8b7b0909276e6538cf58f9b444e5930be9fb026ff9d815df6f7ea5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fab3d3b7c808c4c0fad11886427802bc38ec8316a723fa064eea9dbb2547379a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7468098ee692f39f3afbba7a7e7eecf095eef3c46b67e5ae78dd0227adf119bf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b164b53bdc9f265b27bc47c5aa7c4363ff83fac3e88950ea549308ba5a5e0ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d9fea5f24789a73af71c33264f900a0a084619099f35d7aa168655535881b603"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 
1, 0, 0, false, false, false, false, false, "75372442dc47d1d709bad3ec7c53051c9ccbdb4092993167e30de7cd38ec4d7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f17ad517131c43bf9b7fe3bb9b00a55c31a76e7bc506dbe5b45315df484e51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "be2d3ad4a316e2a084067d79980221cd36d5ccf0d7286698437eb2cd4fa3cc9c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3d096ca0355ebdde61032cf1c64b2929f96d69db5f19fa8c9ef0eb1e7f7d222c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "00b046486501488c634cc21e5c52fb203fbd0a3642ccf540f01da82e9b673de2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2b0bd4206a8b2992313d99f72e6b9a63926815044b32ed11c0278ffb4e3c17c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "25430c0e17ab7265bc64de4505af06de58f3c937c0841a6ae126ebd8f20e4c3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b64c18ba0c89a875343843fe48bdfaebf78c611ae0991b29e74e172dded995eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0e7535df2b4da1d28171a268520c5e5c315dc10625b927f2144601fb8fafbe65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8bedc7684f06091f019605565a03c006a1ebb06f9fc52e6a64a161042816d8bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c1e9fee0e21fe54eada9e9498e268bc3c9deb6a9bc0ca4a1dc16f1f25ebcb323"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, 
"38c619de4b7e892e8dbe1e9dcd6d5cd1213c0eac5d419c3127ec2139de8feda5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "728fbf93a0b5da82531843f1dabfa34621daf3ff5a4821707748072b79c5a626"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "43de26822a86d91e1d32c089218118b6f856beff1d1fc8484385c4d842ed6279"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "30b7f6ea197ceaf087fa0a1d66dc21756f81b6a8267b665197f2c96d7139646a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, 
false, false, false, "30a5b34cdaa7fa6b304a6e8965329570c2a4710bb41fe6d8b33de40d2bf73153"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ad72c0c73ec5cf21ebcc5e0487309c55a322e68c0ad979e42b42912e8377f5ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "096018aaa2fc98018ea7e485b6e24a8a9d0d4b70ebec4c22a30a35e004de04a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a151f63cce163b64f5bf01120517e1cc3ec14214d8a0fef41d57a0215a2e39f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "33411bce48f1d810602fc49ed92324bd904eef190ca19edb5947021fb5b64169"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "de0b120a994e0c7d94d40b4ec3c78007ca45536867011c58cda26ffadafea397"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f72b534246e9d5c10c34369557ad096bbd2bc13220ad1b0dfe726457a9c35a27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3016c3485aec6bf190897476eee2d83c926def5c1d703f38a105ef4285135f4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d642ca653d460a0e0e5b47d1aee398b13a9e390eca766a0e1efceeb03d12ee82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ce99dd160cd8b68ff089326c56218e8293f9229bd0018dba5920121b13beaa45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "248b34b50a5991cb56d5468c149188b9782281f61e0ba987c5b9b3b08cfcc72f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15c9fe05cf708391fd5366db20c132e5d7ebd3738c9615353ac44aa096b1a9c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, 
kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4fbb23e29da3a87eb2e8096af063a96f124e51d78bdc7078021ec44ec171bf72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05928c0c6af7af63790b6045624b8b07363578f201d99173ee389d4b1e23fdf0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca768f2dd7d55554ca31e719474a42120084c513c59023a63705118c80e9c919"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, 
"3d86b6fcfb862d696a1ac4b7eaa8fc5a3e8bcc54e85ddf64b1829c17ed299559"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c6f1bf6be7f6e69ce727aeeb9e6a3a1b8bec3724eb93eba0642d05cd54b2c6f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "091432db802b2b98d120cf3c42cb3e43e0f8b65ff7749fece54eb623d1bb87e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "8448bf00d42cf5e0413494f47cab45d228ba1a0f4f2d23b8cad2bc6ab7a5ad16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, 
false, false, false, false, "07d0858275796e475f7a67f3863bbab6a1d9e2d5c9bff38436b593fa5f060c4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "17774a61f32e41d9c79fe01077fa050556bdf9772c737889a183a88d61414583"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bd9ce5f85e27131e1914187756dd073be1e946456ef5c7609890c1320ae61dd3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a52884fd149d0941e4e9003ae0d1555edd0aafd1e6965f5bd9b6411c49d95e11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "5b6cd89ea5356ab0a71fe782b8dd14e86426c933114f2e7404fe901312e1386c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9d0bc5a0a0760522873ba71e5ae364c8c01d0d697c180ddb4c34686872bd78b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "654f5101e202198fba0c7cf0213b3ea27d550f183b84f3016c393db6aad0771f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "db2a8093e717f5507972c7c8dfd974f3aed7c1240892998846fa09b5fc33425d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9940c721d27806545e22979c70b74fa77ec901e253489bbe9bc3efb154dfbb92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7e46fca53b58b0c88889b1d54f07001835b526dca41e16de55db091c61d488a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "79891c286236f125a415496ffdec30b9f693482d8153c3d4f6c9da2c5531d4b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "b0bd4ab823dc67c88fe798c4e6ee75397954b9000252bec56d3c0822eec88382"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43ccebdf5afb652c0014c739619b905f6a4c23a9e4181866ee598d84982409cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "47eb66d6637ccdfa086d8d5014433f1fdad0b49eac00e9018600cc742b78c343"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b36a831c94558ee1c75d868cf2bdb27474cdfcad9e233743fd9130bb3cc352bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fc6dbc1b0d19ccbfed46bf156c6f4d5dcfe4f117c4de3eadafb7fd9388fbe6bf"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ead70acf719412e6d7314a18152565dbac084bf9e5b5e40b0f361036e5676e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "62ddf1fc0ad2c695882f71b432ab312168d684455a7de5a8cb4fd18dff39a52b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f29798b4b467211a096eb1547a123a86d7532c9f505d9833202b5130dd412234"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "46784c1148adfe5d3e54a9ec8a8a8945edee6248de64890bf9bdbb3323c7448c"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f4b418d98ea96c5eec974ef0fb11d6ad7a2a5fb01e7f31e0eb0ad2607e7faaca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e0fed08aedc63377b016eb9d51f56c4c0778ad77bd7ed9065f74b06aa12adefb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "784cb1116acc421ee239469372c0d3ee5f6b5248c5289a6aa1fc46f8c1351311"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 
64, 2, 0, 1, 1, 0, false, false, false, false, false, "eb50c70feef78193e29d4cc7c18d22bfae5919523fa5600afc8dc4cea8ce50c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3d9743a13bffb04ca16631ef9cc9415b4fb61abd85834948e7b1ec19013dc290"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2cce7a54bc71ed307ee92b751b245e5e1a96045d5dc23ac9d0272e8b18e14b7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6faf09ead6025da5688ea9d9497199c96c50112155a3d95411c0ae45221775fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3134f99b1884abef7d522b565dd6eb1f6bdf4d97e968846e7c056b7aa32819b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1675df4c12784931d487b76785f477286056c94656ff06a12c3a27bdcb5b7d4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2b5b83d88ed18b9cd246d148750f4dc25d7b99d5f63215eff8828fdfe772436f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f6d9c1cda8a696f3ba8d0d25e77f11933516d737295d5d72f81039431ede0dfd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3e2d8eb87f7b80fa5520ba90a81bf419e5479fb11b7eb69570ac29ae139f2664"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ef48511e3fc3f2131a45923e22c99d730191b68c0654a560cee28c1f839889c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "65927f03e0c8fb2463e61185eeb4459f7eb6cf3a5dde60e4a1e059443000103d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"c1dc8009edad2b644408047c148b0bf2397b7e86ab8d0d444aff62f4ce099754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "017ae5733367251d3c9485c52b03584dec67dc51840d766e2b52dbfc7c0311a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "50d1f70449d09387e02d328657d16fd756ae2eb43ae2cce366bcd8a12ddd8cbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a63adee229ad5bd427701ff68c7f659c720d74e814a8feb4eb6ed614f1cddfd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cc44cecd73dccf2a14f0359197ea5988f7ecae9bfb5c7d1828c0df6b4c85e9d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8ffad205c54903ee7f8479942aa7008c871c778cb6d9e63f5a208c4dd0d07b00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06bebcfde741d33afe83e3e480c2ab55f5ca120e22cb613e1da797d745d35cc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8662952ace08cdab7261f5951edffc716eba6cf5824d921d0e35656bd0af5590"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "14b0fb3b21b473584a98440288bfe0d098d55f197b2ad3ebfa27bd90d3b72300"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c5bb8ba04355cdf068816e65ab590881bdfd181dcda163dead26020908489c84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "52c4499452e185674a37a9d972c3fa69410edcd97199d17b1016ffc3ad1a77e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4df2e068ccd0e38d5cb2380ce507c695884ee03db17179573ee1a2028f22b3e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "44fd7cd29958d6630a555fdf2f63754a8c162e20f63c64e97a0aea62b47b7a19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "565c94af7b1302931ff0eb325e691f61af5f9959079028f0914e6605f67a768d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f8705e7c1ade0fa1b742140f26b99159faad1782cf32d86e94b7fd7b7a41d012"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3aca6a206e4b75966c746275fc543a5e23883eb56887c9b1ac5eba1c54a240c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 
128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fbbd81a80c4d5a583c285b90d454c49c114e08aeccfd4464fefef4b116c18921"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2c904baf842d519d6694278e02c8d6007a8c9d269bae98649c4f8815eddddd5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e2e89c166b11ab40dffa4c17e6491ea1d3d70ddaa37e2e458d8a8fc3659ab777"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3c1d890a72eba4035a1ae299ef88d4fa9b49339bcb06ffbc400f69a9f22d7682"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ce8d1aa3f4f88ac82f2c67fc265552077cd06bfe4ea5bf7b9a183291c4236244"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6a42dbe4131fc7a832ccc8afad002e96568612a2997a7e66531f5ec714dc3254"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9acb343e29a8fc3f5b7d72d2d655aa853e2f8878e49f63a0341127044f0aa7cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"98456f49c3b0b96e4262759864a2c0d00400c5d420bdea2813b7acb667717739"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3fea91b0ffc061f792eda7469778f92297e37b841382af54fd06685f8f3f8939"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "12a5de4615caeec946d1c5b05077c9b818624cf210952e297ad1670100192758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "086778dde4f1a41c3514ed1ed93d59452d0edd8897eeaab0b5ec7a548a5d3d3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, 
false, false, "c4217d7f363581ca06c28032f675214950da9dd569ac0b68699b2b7210b645b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c6e982d44fab3b555a12cb80dbcd22098c6bd38ce044f051d0e91cac070c1d2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "29cf1d6ec224753c06deef3936dffd71d031d3f371da9cd6cfc8911cf3f4a34a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "734ae1e9295868c9b4ceab477a37952d3e6cef1ccb324e5ec407885f51cdcba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "43d755cdb6868740881da19356a9954826aedcca3361e74cf6aac714eeea568f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "521de02ca65a70565832c1e36ef1cfba2e3732a288b295003df796e8a1811f65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "890f351adeed7cc61fa32f105d9835fd6ead944170be82463f2ae1d3b4510bc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a00b41e480b0b996e4cabcfd9153bb2aa0cfe880e44e3cb2e8a09626438a7492"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3cf27418b4bd2f68f77c72cc2c4727ff16d22a347cac3286cfd1dfb8e22e5964"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "692b6d122243ffd33a3fe4c2736223b7c4501060ae76f85892c7d3d08a6ae14d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "fda2497590ee4f894b71570f16d4e30dbf756400889eb00b21251adff674dfba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d3795ef2472b35cca72394dd3652e80b697e050757c3c081b270c6333898d067"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "da22bb8af036d7fbfc381afae1694febd958fb87d419f09d23da1b30a43aed08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39929a0ba84a775e65e7fb37fc6451cb30ac94101d1ed3e85ed8c8b639baae37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f7fc702d67d904c4303bf60c4963f3a36e5bb4e20581682ec9d81aaea4b56233"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "593534de1d7d1184065a67cebd293e0f9bfd6bc7bcdd7981b38cfbfe78085b85"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c68d7d13d63258d49c819a2d63835da3047e30fc4c2046e8aab619b4c60fff8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "75247495cb3e8d2217e404157625aad7dc27c17db852c7d94a651ae921ea0d05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4bd0f0f786dd934611e4a0c125da71b24e250e6e258bfa13daa2d0e8b3f5eccb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b6010ef4ef174f56485beca5c34cda23bfb9074c66eb92f520ecc921dbbc4dd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9e186ae186618cd715045523eccdb97469e0ee3888c0373696f6137f0d9a49cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7b5e9af6391281959ea7cd14dea905f9fcc719c90a49d7132e8865d4fc70a7a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f1b66b7fa225e818fd5f47ff4298e84b1bf36fbad76b35d401f488ad6b394569"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bc486d0c8030a057f509b671c2dca7500b63e285f13d58154ee7d5eea432fed6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1a5328d9f3000ab01f0b655440e3f381163d47a919e6865e138d877d6a1133bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d8096296aea9785b9f29889244a74127c0a22d616cc24ee4b05593b404a9f793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "30a234389152b52c41e6742863dd64249abde514469c3936a1da2ab524ef7fdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8bf67862c52ed34f6e27bbce85ac18222fcb9e96d94e842947625fe517e179ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d7edd2b5cee281bdfd82bbeee470522c5d1083dc5a0bb9b641d9476545160247"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ffa1e0c8c4ae2d277e43b392b8a6c58ae19858168beb96519793e97ba4806276"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "36e133cc7d4be469a9198be5c042636534db7cb5107d004ad59eb0907d7c8f1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "110fadc31f93c3faaee3e589cf978e2fe8df57d6550bd5aa3bf89ee5050d336f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "0f9a3508e56c111de3a2c44f51d6f1cb38cee388fa8ba6002b13cedb1b0ec9ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "add8a3e810f284d474ccb89ae3cc4777809312b661c491bed2d27f048032d24b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0098118e55b9bb97cdc72932c63fc900fd04359f8cf070356f10a71762b0e9db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, 
"c8dce4ce9cf029588bd2e4aefa26462c26ca9f7a2af6b0f3f1ea0b35d4a6149e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "de35652f4273e1ab0dd9b62767e1e9600cfbd949b3ecd8085eb203eaa1e84eda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305773b02ead09e5cc1c91f9f66a382fed595feea5334ebd2d4febf0eb82f735"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0b93f9d285e124e338c869bdf405eed222ef420497d9e6308e113e3a6fe6d6e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 
2, 0, 1, 0, 0, false, false, false, false, false, "e88918e64a3f645fbc35d7c31c12da173c7dc9071bae338907023eec07e4fe6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ab133cc888d89d58b291a08af38a277dd554db10e937f4d957a6fc3d423761c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6c3226b241a4b60211039127c4fa66a35a233e6ebf2c6c8a3c909f5956c8f30b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d71cbceb2e250a6416baba68380a3dc3fae0f29920cabf98faa0a07fe710ac2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, 
false, false, false, false, "d1c24b376660d3827e1904f3460c81577cdd828b32b3d7b0f4c79dcd142d63b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2182a36ccfc2c6a7d2bce29d8d2617fb7506516650885750974002e96e841d04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "cfea90b58576b76d456c5448d44835d1f8865bf41a82c7f8d779a3998124af7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a64cfb54e986a4a0a55445909ccacebe6f95e8744c6a6ad31283cf426039e4a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"3125bc998f20bac6f18139da940800ae165228c9ad5464445519f2614ae6fdd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "42d765817be68b343674f4200f72584760a7b8d8e60584f714338e890a785f7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e0629605585a9dc3ca29c94e6d3ecb090f7390aa7a2144dfa946b5cc3ed51a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "07121132ab2a614e78a6830fa08219d5b371886d72efd10395a02dfda23b3dfb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b1b4784f94fc99ebcb5b7cfbe5eabdd646a2a614166c36a8c49568052a49a114"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9e78013e73a51e7d172b799893916cb7c8ea77eb5f1e77a6e64fc0be2ac755b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bdde2454c5bfb8f89e2b4e3d7a3d46fda9a2094d0d85549b859eee0588123f9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "1d58322ba771aa98b7c3673ce29486e0fc61f0f31b05e3670b051a3d9a9f9ae4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7ed8a8f825163e0b7d616dfcab21e70fe4cba85cc4c3b20b96ad062a3f0ce84b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6f6760684dbf8003eec4d1da8c4c7e7c7d16cee2b520267eba662fc6c07a3eb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "13d3b2de888cbd6b55c1171ba7a864977fc0ce5b9c5118d1e06b4b55b8566b6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1f8f3b98d1e19b952eba0bb3d69992fc3e9ac544352280f4140d603757656c32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "98c0b51f856c3b5bee1fa54df507a972a514a14b8ecf8d510d383896dd909ae0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b26fcab6ed7ac07620491a4984d09f258dbacbd1d69256b542ff94d9156d5833"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0148875e458bc35ccffc73826c276994262b45922e119724056516d32a246094"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aa5e40cdb0ddc1965baf8a0553ddd1e369fc9d74f566d2214ff2a0d4db410942"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b22913c662351ed92e1ba0a3f3885e5c095ad6b6ade755e05a62257f50c64c87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "910f225f987bd7f00f7b7b65454f5927968b18fce3c6f980f786d91ea23b631e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ae2edcf5436a32556461eae6dfb3f3206923d62fe9ae89cc446a202828514cdc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fb8b8fb60a3b4687cf5e6e45ca09461f8755e8bf136ddc0d10e46e4a8e201434"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 
256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4620892c4f107aefe0c14b1ce834bd9b3ebfa8774d04f519144b1904ec039e79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "89f8a5f44ec289b2117f10ed183983cdc4ccaf175818c4be0f8ebc13aef19b8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a50f858eab3a7eb7e46558588729b40cb4a2f9b1504104fdaadb610eba32eaac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"9f541c23cb94c5f07f373afe77389acfff234285c3ad3f72d0fe5cd94bf0e91f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "43576a7bc2622ba66b34c6ed8ccf3e99c0b63605aecfa298e3491458219f393b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2000aa4985ffb56c987879d7be43a3e4249f771010ef8e9a47b10780308d02f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0aae3897ddf148a43d8dec8bdf233abf0739f21dca2a28f3a1ebc18940b8319d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, 
"a4b3002d7bb2131c3e2ae2759591371a80832212af4502a6212e8b8551c67dc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "41d810a4b8c5ccfd136174d43fa28761f2839fdd8c379d254d2ac756bb4ad965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "050f3686439c173e8afb9bb75cac264d23f05c24aea7a1788e134d5db529dda4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e3011fafba05b8af8e4b2e70bcb15b50af3636584e6d47b2d091f5d4e1ccd4e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"8cec52b70694ef4fed6d485f0b6c1c1a9f4d3cf695e5c94d3c37cb66807b156b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c30fa0179e20e222f3764b0a826aca4bc7eced3d9aaac0f7001d53faf2a0d144"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7eb9604d6a3e54e4073c1f436f2ed2b9bc91f4ebfba39966168ffe6091c40876"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5444924f93b394e9c7d00d702675a48d6a0032906e5622285560cddf8f2932ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"9ce39bb886e9682bb5b4da9aa69821035949609a3223dd9d9f5955e661edd27d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0ceffc9c18430dfe3360f9608813c70ee8cc566d2246c0d197fb4b8ed2e2a165"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "183b55b7beca1aa78103a4997ec17deba44258ec5c78d493467501ea6f2c0db5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "654ca58beff79f76d648bac3c772d0872dea61bdaca7aec1eb502889f68747c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "8850c7eb360b45aed3b3932732a7b0d744df28ab646508f48247729c79a05b77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d4f8f441997f4a8992808079f82680ce744c9cd22c023c343a17b1fd5af5788e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "74e3e84be6f7af61f793308d7a6c4d94e62033787aa206b2d1679b398979a2c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "77a15d58f1a519a7ea2c0a8f533c76f5b32f3fb3caac4bc34efac95d912ba7f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c312dd519690c409cbe24111cc105c8bccb42cf82738dd69728fc520d714605d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8dd12d3493d76d7841c887de10c722a8d86e91d1bd6002b318e9f2026fb90414"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ddff00b269b52581601694c3f49c7ec304ac157ba127d4ca8e469d7a117d028a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a5b881a7aef755621fee6a045e7e6d0f01f851d28204022e9cc671b2115bd99d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5685fdbd2a78c40b1a3d5d71573ef043c1dac6309a14d7f0dbdf1adbe30d25b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3bb2083f6a45d45d15e155b472602b80f3467394d906d2c3293825690e334b9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d581eea5ccb7fc64288a6e3f55e510b277c0ce0ab099b744f0a0cd9bdd79b9e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6063d1d036bc183a6b8f23a0bcba4e7e4c132545e42337e30e556748861636ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "897f3cf32abdef81f60916bec2578355725468ea669b5726137585cf6370355a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e82bae7ab0c0cecb0974365b6a28c40f327177c434025871c8a55a29ecce73d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2792c2339a973b4a7e46465084f26b24a2624f50fa76cf4cad7b69754c735481"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0d2c6c5b7ac20e1139c374b28d248677579d733c0a194e81ef9a448ea03962dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "af7c49f208648cd5e68d635a21afb148f4213bff2b880426ed1e9ee2f5943108"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "71500417ab4bc8726bfcff3d9b0d832b5d5be807e251770264e8023c78517b23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c6e954c0413245aa57ab8246d9b13e2c2ffb48668cc84a0a90bfc381da170fff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, 
false, "b5849b44d0b67452731bf31f13865c174acefcae3c065a4c15cfb0ae299e622d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1d6622ac02d98a299d01cc9672bc0ff409afb4de43c83ac2f82a36a9562cf960"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "99397875bf3dc892731ba5fa8c3f5b1f17a588a013539dcf07bf7016413cf032"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "cfc1d94156b7b33f0e4438fe8a15445d45591abe23720c044621e378bac8f4a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"2bd42e914aaaefd556b16a1c116e5b6eeb725d4d1300b9a5c9cba396e0591c6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d431ca520d04fff910366f941a6f90ecf94abc5bc1439a04bd668782c915e979"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "65e1cda0285e8381ec04e1195afb5a96124e451cf5a47038858f32baad6e102f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "71ee27f4c1a45c321e92e63f063e946e6d5dac32da94b3b41031be6c58d73d2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b12b4fb38cb5ae7a06b89e91e2f2682da2944bdadd1ae0b71ccf71a362cafbf1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9d4ae2617de40e420b6688d972d57e158307829b83e61069d50f9da55f006e18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f0dd2127382a5befbba6555b39a67828734767d15ec1293a6eac9e9377bd4146"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51733e9ca073a2fd83a52323d640993d3e51b81d59da8b633e68708b5c058e07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 
1, 0, 0, false, false, false, false, false, "cc0208bd2c6667a5035b52dc665f222b8570d4674e1234c4a1e43e0cb3a62cce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "068ec55e1afc1255e3019651f79c0397bd17139d61d12915c0ab0a146699a4f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bfb842eeecafaff0d24ccd27066b0d0ec324756782496204560b6c1c29fe9655"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f797c65d3e7d8b904d46a788806739f675a28ab417e94fa8378713f97461fe3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0213a682c6ba2cd0fff98cce71e652b0e7b4f85bdcd412b587ab28ea0f424cca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "04e47b8e3f80c07cf87ef1d5cf926a69f366210a4e4aa671235d3941930d0d48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3b7ea483a4e76bb008ba12003f96aa1ffb9ffd60d402976a990764b274727fd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a016641a00963ec80e7240695743cd1156f94a1069bae597cccfebe5ee26f16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "352bcd97106ed61be338bea371574ef03636662d77a688342d782a5b17fe799a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0b1743b2bf93c531ecbe9ea4109feb3bce8e92bf0ee9bf9978eefe0cd8a6afff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3e6ab2572c61239f2190848118b2469ef2aaf317a77c97bb7f416ab00a96c814"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "7c1826de40a5db061844f1a35fe055ae7059eb8ce44cbd3b9cf8034e94fae572"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a1e6d7e0aa1d0415180abc3bfc5726f73eb83f9a62f4bcb01e897755b2ba8dca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "859f3f1a7a5796fac7b67e52c24a193223ef9e9f18c9623ff172b27323c8e639"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "15abb3a291c20dfd5b35e5512bf1a7164670ec177359f49920ed27af440d8c9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "40e1505a085d8b60e71f280fea917fd7e43fdbcbba64ff3a9f83762b439da4c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2a5bea1783750b354da40f249ee1a73446d2a1aec65d0295895ba0ece9a50dd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "6fbceb7efe4904474e12257d671b5e4823e5a32278a7569d8d89c04eb110aae0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b207f39712183f7c82e2f45495c2e5a25a5ba9989e4513513853f3d718064cc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b0315f8b239d615b582051438f9037a27a55a283a67a445f0863ba838ca7ee03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6bf157c8e0b8c785ae52b61f840c276285da910ea71e8816347b0c2c97468773"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "9ad4f354486b42eb8db9bfb84fe58c3016edc337fcd11b413eac23211f61baf3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b042314fe52bb1636fbf256c17ebfdc06eb47b53df2ddc1b30c915d7888f00bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1b9ebce31f1d59f8205eea5b086d0135ea70188478290cb59dd377195e523f20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 
64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "85c6c4653149e214702b8118daedbe72f2f575abccb6942281a6705ae0541485"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f353f2c1b8e25c2ff52c38843aaeff4e733a067728f3407c8ec17e76858347"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c15139d4a1cbc8934053dbfb0e940f67d77784780563812a51af85bab3973125"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0924650423561acf4606b264cc5afe95ec4ab5787c55482d7f52fdaaea8f091d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 
128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "66a66d41aa2faf8cab56d3bdff104f2e3e577d5a139b40a67a966085ec5851e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f4a825c23038a3cccd65658e3e8237646e3b4cee20003657cc98f975620074c8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "aac573a70398870c52400978d8f9752448bb76daa3eefeb308b2780a6ab3b79b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "467cc670546b9ffb4fd72c9dcb9ec47d309ae45c36b963bd118aca319b132040"}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0f040a61f6a8431e543505bb26ebd5118824942fb1aeec2a3b4084e97b4033c7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "590242ed84abea4a72e39cb766d19b6e9a7a591cd16c5b63051dd7d0cf869760"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "23d200ba633cc282eb880bba15f4adc6337d6f14db3d78aba7912fa3555edf08"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ee1f6364fcb88f7afdd0f54b928db0c40f09da064aa2172c52d1e9128dabcc4b"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d99f6e607668c372bf88d73ca331f53d6474208f33445fd05e08ef93733aeaea"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ca746cd641c4505a964dd7e577db63861233d83923c15631087be9f005d53bd2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "740892943888b812359d8565184ad79817361b6c8235939efade9732f0e989c4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bd0147a41419ef8b97c3a1bb8376ac95af0af2f527ff685005b7980cb5e17d94"}, 
+{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b9528a0580b1d286aa32a463d4eb6db496a65aa85bee311d23785c03b8f8b7f3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "82c3f927d1bccbbcabe0d058f0e3d5d8f157bf38d7ab3f80514a26f88bbd3c00"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "367ad2a08c41a21f5dc9ba7578e2aceab9ebf58f69034e40473fac8ab3b6a647"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 
0, 1, 0, 1, 1, 0, false, false, false, false, false, "6256cc9041ae8fff88e876bd57fb8c6492b04a16b4b2d774183cd789721a7bfb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "71380233e738de2b8dbef1ed978e5dbe20d06c39966397aeeb8dcfcc48accf10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6b851bd8f4826e458d7a9faa52f122721340a35feb3696c85f202e97971ce05a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "255981fdc6db62854a539d2fe6db24a9b7800bf91199314eb7710b209f43e5c7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 
384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "556f00a3aa41aaad8624a3e72f8a3e9906b971eb097d5ab01914763040a8c27d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05fcee4b9cc1ab02bf3835fb630068e8fbf1c422c64c7c67629ae3a80cf7a1e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a92df6ee417a893872240535368db021cae626d41be0103718a33fecb52aaf48"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "53348ffe0f0c5f8c645aedd952c2071e93883378ca57f1fbc5786d4b5f3351d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b00ea14835937bf34fe30c01e7eb2abcbc9366c79cba6859955fe298287f8fcf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "e783c3abab7944891e2161be47f3430a0a94ab7db09f5966cb1a96b74797c769"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "50594f59ec2e4c7c00fcac0bf128559ab95d1380875cabc26a924bb78509802c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "08393f906beb50e1abf8dbae06b36113f35a46afe203a19825e95c93fd1a6283"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9027df64c85bcef0acc2e75342724b1697ffc05c484c258b391b86030e85b556"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9aabdcfe93867aebccfd3e01788357161009e953b1ae5658e76858b1a4653b05"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cafe95cd5680970faab01fd2f1be2578ce813f18f2201b1851a69cb72f5b7376"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "022475ec2f438667c63a8dad02d234081e9ac33ae30e14622d0f9242464ac720"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "496dfba2d4b9c8a66b668200d4b8189eb7a3d02684adcbfab3411b5d9af45f69"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d325d98c22a31e0d4552eeb7fc1918f05b94a875a0a43050866fec776c89b997"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "98fab2c21d54a75cc38492bf0eaa25064d7b36ddb455c847b4bf161881903c40"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d1a1ef7b89ccfffc362f7d2cf2714f93b5df962fab37dc90a3f3726723c7acf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ee68e4f92933c7a3f0b431eb7637f4075786dfb77f50f9f97e73afd7ad1d7a11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7b5d9180d0f03783e0f6e06fa051a510684b6423be33579d27a09144ee47b0d4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2c61111cbf0aa34197ff002b59e9c7b4532ab4e26a5a95fc92ee725c53285564"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "15e600f527f6b73d1c85fb7f45360f3fb5c7938f16911a0d50dbb7d64d5b446b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "34f2cf5a672d74e758c81dca117e71a1fc3fc7cfe09e75b78bd9b459a7294f6f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "aefb92198b6a754f083eda9dfd9c4e22ebcf1c8fe2601a2063c73ed1f20f732e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "97551e7ac60628491cc5c722f3acc155da2828fcb10c75baffec31faf717ff4d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d23995e9fdad42ff90c1ec3bfd6eca6980d580bbcd23ce3ccbd24a076070d22b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "90d0320e6265ff6e74c5cd77371c35a8305839b1f347b44583457db6b3b45709"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "53b672f76e70c5d2dc04de59ea0bff6e6e1a413f496d472dfb5cf6813800a19f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1e4e4d2e7cb2cbf3e000fb18999b993ae909b4bad4e68ceb7d39b71b53993293"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9d78be211193a14a38c03a48b688c4a09fa2321880e861fd100b5889bd5e1140"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e7fd524f7319e85fa49122c802351cbc1a945e1d33f6e95d7abc087b0aacb992"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "48950fedfb3b7209d23f3570ba471ba8b8fe483ccff31911f1fde6a94a092024"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7bd5d3c4e44c907270e51cf981fa7985bddf21149ed8fb6fe3cc67557fb556b6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"0d40c5aae4dd5e34d95db8abfeed56213b25c92a3d4329e64a79c8c952b83779"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "457fd72674677ec804c45e58a17b23af37f41ea0ff12c9b84318d780289711dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a5e289f2822864a6eaf7665100e352f55a4e0774639411e1ab2a83dbbc00d4f7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "9a08209488f5387fdf6b3207fb74ac76fd3aa98c21308f7ce34a4969fa4cefd9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "14de1e74606db11be54dcbc7a4a8c0b1af700bfdc94987d25ff79fb7b639109c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "630c13724eb6557597ebfc2c2df8bc7cc0e754fb3f624220e25f297144d0fe26"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "71ecc9c0a8cc63725aa35971fddfa4f7b019de673115b0398a88c514ab508bf0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"eaa4ae198363953d69a21794a98eb65ccac700bdd27a77ae187789bd37c11365"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "47d06fb3da44c55356d36d7ffaf5274eab25d902051bbe4e6858198cd408fd51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "756842e32310b3c2265d43f2d8dd8b65385daad8d20e76b2a9bc77400250debc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2be7ac300c1d79f00b8bc5dff55831c674d599f82e7ce822f6fe7f8e8e36612f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ddbfc1ab95216bd40ee85748c147bb2f59e898e8895a0119289feb01cf90e068"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d3699a3f3bf609ba91575c5af9b96f46b0f35d026d5eedd5ad29f5c0b4f14f4a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "21a36164502f71ceacaad01a7fed899cac653eaf55181497880f7717dbd7a607"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f2558adc88c76698b16934bdb91ff87486c13fceba0cfcba4c4ff7380d6cc2dd"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e5d6f0663bfb6568600995567627c6ae753e32556d7d2e148e334b0635e2acee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b18864805ba0e8eca0a196d651f3c176b02161bef58c1f4854720e221bb6cf73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1b249aebee583ade3e98a276645e2d2d8f356f9e9eef5edcf3c43e99fd17e140"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "24f5ae34d1de8d0b1858e9b202b0c227034cd62d1fa026df637d596a8e49214d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "361325f45f40f1470a788cf1dce4a014dbaad39ee5407f7f3fd1af9bcd9e298a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "30835e85d3a8a2b438c90414905ccfbb78cf2cbc95b942e683e4d3b631f085e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8c46c404a73582ec1d89704f1ad942b3da0fa090d136074c756273a8d8dc75be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "64e870b36b5b0f3ba4c8745cc05a1eccacc54ef70ae6253396572473f168716b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c62c25d21c1adddab217fc0a0d9fbd62310211fd161d7827061bcb12c0769488"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4ce7c905f0d5267fbd256dd5b8a6355d3ca76f6a7b60f0dd94accd56cf8b44c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, 
false, false, false, false, "71267f1c2bf229ed3ec3f0c95ae02bb8220477032daf77564ee17969505484d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "34d28cf86de04141200d4a1f246168b9be2178bfa05c781273960c9d6a0f363e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0edafb25c7f9a3f62b24de39d10c6eae3dd8e6d837bc887fe3b9f6c330291f30"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c0ed90493b7a2b273b0e87ed4cfc19046acdc24533babe0211b7c054c3b289de"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "be09b1c501057fec80eedf51b3d87b2ff6531a14fff34e0f3d723c49128546ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "247d0cb4119942684afda0e4203e2a1fb0c9be6625d775f586799b62ee742aa1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1da50cb1f8e3b16609be48df759e045c0100c3f3da4990c7f0b9b1643cd4c3bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "59623b0c47a2e8962fe2bb94fa611ae50d02ad34572154596f25a13818d5cbc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5a79e38c2e585d3ba9e9cf900f9e58d52a552d44290fac75de6b87a7f70d5da4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "b731302c24a3666dba67a9075720e289169735bbb6466f352199b93a4e306bee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, 
false, false, "3b609896609f056feb9d7327a7a469f7c583301bc4ed20dc46ddde101f145c75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "3adc2e52f878532cdfeeb000d703f61361361f8524e28aebec2aa17a7e8dfa01"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "4b4bc67c6d463e5b368f253f21286f3d45305b702b5c29f5d1f374e3fe4ad991"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fb8855f2d36407e75195b47c6a973becea6f3ec95b2fb4191cae41915f95946d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b74a6335dc835528d8c8c2832c41e0fa0a631d2bc5a483226f465e82c59e403a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2b687af13ac85fbef6c26bef425a214fd0b554df82ff00c2c4154c1d9fb6c9ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "cbe37f7e147438539f1e978f4d978b7ff157af563a829b0198c7c979f7687bec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f5b7f3bbdf972cfe74c95bc64cc98adb6dbd23d0a0ea9daab98eaa3df3c84f89"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "50fbd7f64d6745686a273f576424ffc9a3efa8b5b9bed1951bc0f53e5dfdad3f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f93ee8e710cfb582865690867c2691c96572834c60ba55e24e42ad9e98b365a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65a13b8c08996eac65a91aff0bcd0ac4ec27eeee713c9ad3551d6e023112c361"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2c86eda2150b248c86c20321218d4d30cb2460f57ff6a51752418d065b12de43"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0a6a3414c212eac65cdc77fbb424b9850cc5d274971bb1ba2cca45a2a6f28656"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3c33b2d4b2d38321a7492a2c690fba5fa73e3df6ba9c06cdf3ff4e0f2bd1d951"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"1876e9d28235b266ee58627462b88040efab4fb6dedcf74b7f57df23b550a498"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "147cb598211a5d4fb90d7a1b88f5415989e09432be2bbfba220183d5e88540d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "80d37bc939a1160044e1a583a9131566c2ae2389d63cf49f2e9d6809804d3c89"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c865786fea6141651a650fd3e815d1200ef1b2a800ec06686461f1dbee40cb82"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "2a5e59f9947d5e73c24bacc04f8f02286de51954fd4f4c051da2711694ad1c56"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0071a698d3b8fb379821efe7c672607002e47d5b2d33e4a39d286b61b8ebf1bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e46c7bfd0a8aae5900869125c85898fed6cc8e66724482d14d2186ba8ed01208"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "046481c7a549a5476b27c26c80491990afdbba7cddaf24ab213b51d2ec6b06a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "9b45d1bf5cafccb729ac8a0e9ccc0442f4b998c7c66f03694f1f554cb70d13f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "36f3986de1a0f05c6947ef03c077bbb84d6b76d7d3bd6711e4c2b1b00cd4e1c7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "e3278691104c152379acc04915cc2ca343de07b8b89646784fa37f878c8bb23f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "dbf975f77e591d49804b549ab73ef71faab1fa0b6764588b10a6fdf33aeaad07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e5d55518ea22ae8f99fd25a8bf914d219af44d3377018a650b4e25bb440416b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5dff76a248850f21926fbbc4743ea97b117cc28c024a11eb597dfd89deacf09f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 
512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b35e2c4e28b17bfc776f3e9b1acecc78ac2dd292c0cdc62465bce3af2fe8827e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cee1a9f7450d0b1223914549a33fcea0051f696b8a67246389462419e3b0f891"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "ef5fed8c784f9e0b1c7e33cf1deedafc3fa2d2802063d817f4928a8c0665d0c8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "f8b017983fcc8b8b083642efce6ccf303f9fa9aa7a456a3fc9c9d115c9e149b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 
256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "8e3bda5a1513219fd5803653c8fc983ee2972fbca557f2dff02f436959b531dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "bbbb711ba069dfd140fff4b8c04473a581e2d1e4f17694cb76d881ef7257187c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "939aa017d0c091cc410c0f88b3d2b838b73d930abb4128a77f356bef2f52cdb1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "03ca199656a7070d37475fdcb00a934e55352822a725f327ef9b7fd10aa66ae8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bb0780e1bc71ef95f4acca0b0aff0fe121ae4bdcd29e4620734d08830fd6ee21"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "21ede5e61705fb15a366dde9d277dc746ef61564117464f6cfd44d4839e12f13"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, 
false, false, false, "807203c79853e0abbf51d102dd63401130dade11343cd7c664032e3e3b17fed5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "130f5ff9fa30195fa9021b573a2e5da8907e7f7c5a94a45c75e35e764888e5da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "26f2c18fd3dfeed595dd9440690913cb33c85f0c2da1d69f6983c64bcda00698"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b0c53dae2bb8e76e9096f8d397926c02aa30c9baf1fa4c5cf2f33c735d2b43ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "8e95a5dd8c2ab0721163ef513cd96ee665dcb468d353ba2ccd4950c70296d1ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "6b8cbb5559b1bca1a3041ea70dd569cec4553efe04697a92fcdbd869280e8760"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "794d7a98027f6cdc5806c764ef66ee64442a6e9c323d65b6b2b07e97c9efba1f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "80aba4f3aec5ec7c63c98d191ab97484f5be70a75ea2c2aa3e9753bff76b310a"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c1bc85572ca504d480d30b9209817e16a8fca97a276e1ac33b79c2c3bdc23af8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2bdd41ad578b530e1def5e00b5e414e7615bc8afb76afee38b8ff3f1757b6534"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d313c075bfdf92c6d6bb61157a27c3290b7eb0299fc3ed10237d551ed841bcfe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4202e7d752a0a776eecb416e622350ef7eab767d0c4e2cac8cd25c515300775a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0b9f53b680c7c08c0c74e5c04522f612d3a74a316f4b83835511c16346896156"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c059149a81deb0ad820e6ed77f98dea5d87860dc3365609d2d6675fe45d4fa2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "28e3e20a1ca8cf2865d92c8c3ae7d3c460d0d3819bc04596785c0c78578fe37d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4e00049e587c58e146b9048b92e98595b2c0e4a07b0f0f21a609b5d393329362"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "01b0c06fe4ae37cf2928d5abf81f6bac5a467a70340bfa5ebe924a040b9d53f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b3471b43a9b0a822b64cda739671156880ae39f3225c0fc886a0ad43712b4929"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b0cf317efb1ce89239d8db078b15b68587821c0529d2a235571c4299578e010"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "fe57e17ef4a9db3918baa72a9df2b0113c92214742c5458d9cbe534b9bafdcea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1fe5f4d800b23a9c4fea07fc8017e0acc3d37bd45035f48a2fcddefa206f66c1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b17b5904947449006c2455b675a3da85cf294be1d03006342ce2f0ea8d99199a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 
2, 8, 1, 0, true, false, false, false, false, "79f5206bc620abc894bd53c2bb732aa90f6ef7df57edc0f4b5f7687baf7835a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c77e9870d85d8d4f8709246e66f36f6b18bd962d976c6feef5caec1f1ae6740b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "db9a2c00122cd18c8db12f1f35f4f087421757efa695197caf782b06f9ed7c52"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4f27ea4235d352e3ca4d6abae57b89072356b362a5aff5ad24804898ae45ec0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "21fce85bf18719f3212d3d1c3cfc815b78aff385c85b36d90db6c6bca2b877c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "aa39cf0ce7d69baee094715b155d983f1cee5f2227f6420efad249a66950b732"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4bc2d3e8949595624d7f6f484160c5121433ea2e0f20684691e6b8d596b600af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "eee3f680b598e4856ab2dd273636543f514bffd28116431890f5244530e0b680"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5d77929f3659d6fa7ee9f65346fac6340b3b44dd0431800ff305ceab1ce77b78"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "16f705e299d162d0ff12dd21180e173d50185bdadc9c1b18f4e8311b084f5ed9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 
2, 16, 0, 3, true, false, false, false, false, "2b62e5a8261542eeb06af1475a914e670c5dbd62b481e3fb71283a476702c327"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ede77ae1bf1ee5a1df65fe6bbdd53d37e730716e102554b4ca902f0616ce6fce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1b2baa6ac764b32a0a213754fa502d7c609eed94cdd79bf6f3d7a5c9fb67ded3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8e43a1668a79a023bcdc24c79a17a5668cd0851c58c63521a1907ea6b4b08e27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1171d9c7e20c1f9bfe268b9c697a065e4e0dfb131d38ccc0bbe77f3e7e8a489e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "1338ed9ddc2ae62c9b8e5af582fc635e8aac6cddd9ebe2e564ca6ef0db8a7aef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "818be6e569751426dbc2e2143d8a73a096c0bc4302a12720f03ed0ba005d529c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1316928dff4f0027f8db5387873b2a6d7be54c1a40be9884494b1b6d7d116bbf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8bebdbe303ad6e43f70bba7c66bd13fcb2bab60124fd12259a27bd04837b6808"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a75eb4a18dd69bd63f70e7b067d11650a9900d57fbbd8d78d3f5fddb68d762d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, 
true, false, false, false, false, "dea350a87a2a8f247692d250d934532a1d09dd1750620a34336738df89f1815b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e21f898b5654a6710ce6fb693a5bde47255f625ffc4ba385b2f2ae47ef5a0ffd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f94a8f3ac31a3b8315bebebd9a01daa04efa58dd8fb999a0135f9b08690d0658"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c820ab53e328d9a1bead97ea0d62a83d83df22bf34a3152251ad543ea826ae5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "753b73114972e4b96166cd46e2410489baf6b1fe1bc796f192154298f92d2646"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f4f1ff9f51cd2b6559c50356e212a1b002d54f8e3fafaf5ca358a2b7ec1fb584"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7ae7d0ce89b57a04add9ab5e5a25ea2e204916d8deac25e6bf69db717274d380"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "127bf0a765369b103d3fbda374c94500777984204c19d3237adc5e7024179391"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "382d98836bf06b9d8749fcdd95814f241c9ea600e95ba17c403fc42a735c69e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1edf7a596808f28a8f0135e7bd0631c297c4453549eda2546d02a6f2ee70466"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "19e1e67e6b75c7fe81e181478b3bacc72775ae3a7a6aade3cafccb2e549f9e4c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3024d274f1d34997cdec83caeb3a67a9d7b7b409c1056f5a4fdb6888c0b353b4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "277431b07e264b08c91cc49c03a559309a30d448059da14e0cfeaf06a470b4e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 
0, 3, 64, 0, 0, true, false, false, false, false, "d2c7326be4e9c14707d25f294d0b761fc5ec63acbaadff53459bad8b6d4c63e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e9f596f45360085bb7b5daa8263fd00f1eea30b573b9198ef67ff01733757d10"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2b6500b62cbeee9ed899efd411c676786fbdac9e1ec2d097ae4444e4bbbaa455"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9645ee469dadbc6266ce66afb3ed50a098f691fb6ff766c0f39f49422b59597f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "05b50f88e583146eaaeccfce79bc03b7bebc704311946c5eb5ba7737cec655e2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "759cfc74d3ecdc97bca6e27999c4c9df424322c6673cbaacd01eeb45f2f58759"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "60bf85feb725e52ac1325eb35071d48a1a5c86a5656070716c2cd9511c4ed869"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "b9a75b7db734128a4260bb2cd017b787a08ce3e339d6a9612401549231842586"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3e79b1e7f0e7dddad01116c5dc0009ae2d4f5aa87b5c9e8c5a1015b5b3c9bb51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f52d367b59b9bfd0723341f01bfb540c03c7b4dc5d494b72cbb1d0c969f66462"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "6864110c9e43761861b7c4f28eab883b45e7d6450fbd407c953eca0a92fa6793"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "683e09776e1bf6c820c7db004dc833ddf1d3835247ce11ad1920b82a4978b66d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88e8fd790e18f77755ce51f1ce59d8d7f63a9f07814bae85fbefd4bd34e95546"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, 
false, false, false, false, "1efbe2d7635606b844eaab08ba19e61190fa3271c0ed77e2d2787841cdc52dbd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "dbc8e45d8e7d9974f6696f1afb77a043af4cd134e0ad5682ec89e6bb47abaa33"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e49aaecf723700c1b15d9d679fe9bceac927805691635922f4ea407b209b6ad9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "47af3254e4ddf6db7f42694c8bbbffed6998659e87527460f9646c714d4bd9ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a0fd296f7ea7e743d8b090371dbbe9cf28a00614bd6d0d1607b4ecb80cd3148a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a5bf0563631360660e370f06dfcab8b7b33479863ff76e8350af3feca99f3287"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "697f926069c27adc669b429d0024a0e779a81de4f79aefc7b50a780a9dd2d144"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "f0cdb4c71b6a1c0d9b84fdd419d7df3f51ec997f1d7428237d4a43a6c1847826"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c81405bda5832155d838db2200b1dca3125048f85969e849d9711caeee3a1c62"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c43b7523b32d781ccc40c7f43496a11dcdc200ca506017635ffa95ce440b4e0a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "175c2bc4c52519a19f97a04fd549a15d72dd498e1974f2bc0b6a733019331b2e"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "cfc315b2dce06e4530fe33cb03e4243cd4721118139a247d0000a58e8e9bebf9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "0660c14b7eb4839b113628e6156477845c783b87b15ce5d27e4ff878a4f2ebf0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "d1eb6faa2a4558d582cf980c62b424164c85cb9408fc4a6efaf95fb913016714"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "a05a44375fe8a29fc5d9dbccb7677ba9fbe0efe5531351d10e049ab5c7ebc2fe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "afa9367c3268b444c4fd2ef89d18202027d8c6a26d76e7cc54cbd5e6cff035f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e8a801376716fcf6aea3383403397a26e40006201d1f24bfb958ebcb89662b15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "2d178922a19dc479a4bb91aac064aa7b565fb9efc93f038a35491ac7a29d0da2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "094dac4d344f93dd59572d147199e12cd7cb72889cc85eaf6563d8db3ba60f3b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "4ba1e3b60cbeb39611c0691910db32a65018e2bf13060dcbeff223e955b195b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 
211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "46a0a5da461607999bd030a4cc6a019b8fe5e59f8edd21c7a066c1d556cb21b6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5969c0b9e6aedaa2f5adc9883b4fd58e543f315b48ebbc8f379d41b86b1b8252"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "6bde444a74abe45a64668e16d9fdaad5212098a78b31ee70719c1382a10f5c2b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, 
"6720be0df89e1fbcd637593a1cffbc8eecc935e6b958719d9b5309f3dc0420ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "692ae2c0532556c70705af9583ec3a88d11415de8606580e8751d261c56d0963"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "77a609e53aed84cc1c01fb1037160e2261f4b1f4b251b24c2a1139778d9ce9ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "8e61e9763737db6abb88eb1ddc9aff5393637d0cf2b184af65d8154545c7bd74"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "04434f7f849f293e609d15f8efe56e9daf2811661ddc7b2234fb3b69d44bb6a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6279bd2cca629e25eb975082b9317c4f48b38238ea57a73186bddb709ca8452"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a02655dab8bfbdca6565bf0d84b793bafd23cc2582f764c3fe57d4e95a685479"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "085631508091f8b5d0527af168a369bd23715ed514e9953f2cdb72363786bd2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cf335ebc091ea120891c1f3e08d6ecb003c87356e608eada590678ac561df3f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0e18203e03c38b429b9189cf2c6076469b40c74d2675f0ab7cedbbb78c905313"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a0ae3256b54ce9b7c60136d0beaa1ee0ccef3905cb388db2ed9e28f7a17cd22d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8374df8cb128bc5d010e0e43a70af22492dc329b10d62b9c1c137e3f0821236f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f7cf73f659983af72dbea61eefbd040ef153bf9bb0679a3d1ac33518e0ce2917"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 
384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "70bb2f57cd2efd3338fbbdb17610f1a7dee066dca38571bf604564962c066f17"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "44002984b3487826e2ba35aa91ae025e3c3299facc047985dacf4fa1f0ccc919"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "4c067c4e9ad758dc51a10b90c7c210e0d06a31f8b516a56dede9034d1e354ffe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b95054343963c703ed6055f16d9f3c39c015fb68f9f5ca57fb55d2c5f2eabb67"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "89b11cfff7cb6cf109caf757925b0d9d1488090d239c1b9ea610ae6a298bc0a3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "443db7c4772cef999bd0b34e97b9f9cb7dbd10f64b2c40b0b4af1e9ca7dc5e76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3e0edfad24226ed4a1b876b4a49e14a996769e5e63804943c4048736b40a17d9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3a0f6a219ba812c0b33338e7abd247154e3c2e16657aabb6ea247414e54cbbc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "35643782a477611634024766061f1c34b481886d12b872af502c4df11447fc6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "27e723edeb2c5b3d2ca782944720346e42766b36a3856f6594d79ffb606c676c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 
3, 64, 0, 3, true, false, true, false, false, "9987096f28383af2b6058994ce2d5058d8bb1e11b2f7bc2ef4434e70ce9d35d8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "953419175ddc9e5aa8af6cc18e69519d229cf59c9cdcebd99cb6d7ec5d2742c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fb48b5adbb371b7ef0790a3b15be815888a59f0f10a68db0bfae871967201f9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "cb47cb4763daa7e309c1fbc3f5b58191408b61519b6e3d4010205b2af2df0910"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "0a659ad765284cbbb7e1026308446677e815959974c2eaf610d22c5838d8c139"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "d09e3a1fc09e2ff81f252bfe543ddd50c6993022d0793f80a6d5b7bd2da9c0ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "50d737d582663a4cd060396ada173ff1a09350f94b7ede5ebcd960b89b65128e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "427f3a74026e9f3efb9c99267bf3de0b9d740a9e7016732b4bb7a0607f5e9b87"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8601bd3db445967017b396176b4764140b8fcbce6ac3a721e3ff869b96bc5971"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "eaa544b3b66665cd162d399d11af7e3c89b5043520c8e523457396437263c456"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c81069e7f9e8845c4e8fa34dc5a3a8f4db8a25d05804d44ae71023a9aeb0394f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "beca22c309088755c664ee916182cbc0397492b679f2e39749c880e2ea2df844"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6b50bfa23a2d505c51a260adfcb46ffc8022aa13c6dcab115a7ed8e6be85c96d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 
2, 16, 0, 0, true, false, false, false, false, "24c9fa8d9ef2235b6f76acf8e2a6d5cd7d13c4e4156c69b5ddc1aef45eefe42f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "d9a4e9d8859139d4534c165cf2b54f10fb33b0040a03acfd02d9bd7ca964a3e3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "e4401d53907090bf26cecd6dd43e4e086e15472eae12e413e54626f8b8543a72"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "324a6d1378151d904d5c536dec64aff45cc6e7364449456ed3acc4bc9957423d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "c00cd78883d17b8f6955ef4a085448205434a1cee0a57bee5848e64849c18d8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "316dae9bd4e97be1db9262a55848b4a7be51d75acea138a57654bddab6ee50bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "15ad1d9099d2639e30058ef0547c5264d9740c72d457be3fe61c01ddbc466143"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4ca9e0e82f01db5904d4a4f42bb97e113b9f6f3e25fcc93670f4a235f05d8bc3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "941e50594eced172457b81b0558375f216ca8ed52e38d1d01a0277e8dd33f862"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "83dd62e7b139c646284e1f02822716ea668a652abfcf481520267270908038a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, 
"99a07de3a573c0d13fa1c988831e89920cce98568dd3f8a9dda1cd44f531e28f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "aeba8d8e70d7a5831af536e1ba6079403986e3604c75832491687fb246f4c91e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "be210cb25bdbd46ce99555d444a4db88303ec8c388897287d68fd14813d035b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "ce52894d104591b83dc04e7dae747a4e241fb8312c67a1b1b21c55fe460e2caf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 
256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ef2c185f82183d1656be21e4ca66708a8915967f64d16941d5a8918b92d56c13"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e43fe55166faa22face8ecf770c824229cdf76019eb32a8fdfddf6d66eea87ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "bb51be28d839c8f6886609c115114342dbeb3b4022cd540332c459267aa7905e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "2738c1a8080fd9121c065a23921389020a7b416b392e6b85b7f37e8f22e9dd01"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "d37dd1bcc3c7b6bee7105c4ffb0bdfa1acfebdf9435f075bb439089d1e7d44f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "6beecb7b05df3647530e987a7fd28ba646face9a9e1481098fb2d628e384250a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 
2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "fc41aedca2b234797e537a0df1b8b5f787002299c2b23e6fd3fd28c5be4ab204"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a0e3490a9d0a6bd551f8fa6ef1c3ffcfd3c769b59bb91167ad13faf0bc3d3688"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c0f47210035aaf2f9f8d0ce3d3d4e6d5feb3d8ce6ea272dd03d7b04ae28b764b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "c21cf7e0a34bfa5099583eea10b6f8ab59e965d81e9d99c7940871acb97d27c4"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "23fc31e8db085ebf0431d001f1f4035a3ce983ca1b71ef262e71d71b26f8968b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a5d3b697e393f8c3d2077db6067946130a6abd8b538bdcdee65ce562b0f93242"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c5ae1676d4a69ca94943abfef90fdbe1d703c690501367eac4f96f10ff326bd1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d4801911f3b74eef1a279c2f1d7985fbe3d327a989844b403eecaa9076345a07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "431e6b546e4fe2b0735547ff7f3faf8c04777001cdf75c87a609f7c72f2048f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4ee79ea8129306f5c5429704cea0753c3f62bee967f3028d946294fdfbd52e68"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f0fc31fdbcea8bf73b3b011597d3d5d1bd2dce8b0f30b2d4ca52d9cd64d0a5da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4b79660567c5e062670031e5750830a832984d59159cb6faf16c94e926fa5996"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "835bbddd7cbb73e69d1dc384fa1ad70b5961c6e93caf570cc34bd7a71462cad9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f24583e68b39666888cf5d49923dc7c850c867991171ed327ad04979eb3b2b6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "560ea49625ab161e9005ddf0075ca2cd863273c7a8dd48a79f0ae0629cae091b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d6d2a0ee240a229f3f0515ac66489ebf162d03c80bdd2035e8223e28e00910c3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b0d5a1bbacb2a601b3e5760a158897d41f066aca674d6a68a1a1d1bada1fa3dc"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ee74dfd3d91aecd13089ec0e15445d6d198fbe54e627189d90adb04a4180fa9d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bc3b77c5fd1b71897efb09b25f4a8a7ae96d12a7b1f9f9fdc2eafe7cff900694"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "40c36f214d8558ac8a6e4a67f87e8ce2dca7fc6f1323fcf9527dc68cc931a358"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "833e8d90357dcbccfaee14a719e76079cb472bede51d01a2ee6b31b29734231c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5bb472ceeaac3fb05dd8d40fab23a7da83984b8e77dc16c691f0a124bf4d90d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6416bb725afac4353f57214bc2c49edd76636dafb87638f18a3c378097cd7a02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0164d6aa8faa8247653cfdf5d0352580f68b3268c111387ccaea2b8e789896b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd206d12a115ce9666635d9b0193b05a4b4183a641378b2b37ca1f08b40fb872"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7874b9e358e9df71bc046aac4c514bcbdeb80707dff0d51a779a6508b9c187f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 
64, 0, 3, 64, 0, 2, true, false, false, false, false, "b0a7a2e61d2f1a7e75a679b96a230fbb6d736bbbb450037601e477da80b5f5d5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a8ec2195ba70f8c4739fe776920dd0289e748b6571f7b972dcaf801e1e5e0ae0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3194491b6b9804c70e2f6af262391325c646b7bc92c48b46e321baf704e05fae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8578d675f92487b9db8de6248f5b4c69106e10d5f52864bb260de44e942c9caf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 
64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "716d5e10145a4044ff8060c5d1fa550d132326314bb741ca7df3c428ace72ddc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "58fd6b496afb9954fef093faf991e193385dc54d16e93dcb3bd95547c35677c7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4dad32e85fbe4fe484d8b8a7e43cf93b26f189170b50165f31a60f0c280eee0f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d281b2f87d360aae0264cc28ac2c783d8c4860af96f0a18d32cdf83be9e5a900"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "59a3bbbfab9493494a7e27dcacbf5301b6992b0e52579ad8a1222f3ff6060955"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "95c2999829134b5924d75da6fb136b7c3adeefdb74e168f177f78e758ec86102"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 
0, 3, 64, 0, 0, true, false, false, false, false, "76e78c6a16aa11eaf1c23d4e0e3d65d8b97ed8a80bd37bdf895fafcc9267c864"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "422e5ba78c04a678a56a6bb25dafb1c6c989948d01a3c20a5ed3ad8e6a646dbb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3273c932519be7b44f1b951f90b1e8ffea147b3bc90ffe6f8efb5f95ef5f5d7b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5b3f56077ad3f52f503d795f716a062e9f3ca0fb7fc5cac1fdf871be7b4803db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bfb2cf43114fbf6ed166d5c4fbc945c34fb42aa1c2fc6aef6aa58ff402287ccb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "92ba3838c0e80d405dcfbbf8cc076a53a5b0245e66db591a872041f90b3486a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6da3b8ba30f8346857021a2de778c7c50fbb6fab34bc2924e7e951943891370b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "9eb3b9c1aa4c3e0b0f02a1df92f9717ffe81f50d325e7a07c8d87175c3ae684d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "3fe70b95ca042f4f1ef60682dd21d07838e0b3610d369a70c344934eda8c41e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "71b19fe60b9dfbf3dfecee6aa3f3cea6a54baba809cc891395b8b79afa59b03a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "e8c0bdc87e9059464093a0406b84347fe096c807a12edd2a99b5bef98e10da05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "016480abe9738869a550c735102a7ab1a30c309e2fb779650aaddaaefd74a52d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "5dcfdd1ff324b3222793a9f7a9208f19c5f7bc9707fc625536b76b17629f129d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "c9a67f62184382a97288f8c859b560cc23153320352146d185a72efea4f49de1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "c616ba60658581eb371472cc5186e442f48142d071248198dfa632fd62a5eda6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "1511e8e9bb6d33c3d1579f3781e9fcd79daa6531c6fb9e1fd9a136311b6a4a56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7df38f14dca49f547f01c238344ccca8a98f24c8279df0201a47e44e8192573c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "854acdcae4b3c0d651f2af71c84eae63be1a44af9bc92fff0e67fc62b17936f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fc7a8c3171ba1b5c31de122f31738b8f4a50cb8aae63364a838154486c34137e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "dc49e5ac658cf0642e044722309397981f31627ad22368e307848a12fcf60034"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a0accf802c14cc8b17e88670b64e54a3142c768e6b0a6bdf15eb7dd831cebe03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "bf77e551e83a4e2941765d645a1d3913a53732e2cc0e1e881fdb4d8e702e114b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a6ec500f0a2ccbacd2fef1c396fbc54ba79059a92936efbaf33f9abb4ee0958"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "007932793b1d5b6adf62dd896e52ae320f2b3e0b74c3a1aed67e1265d36f00df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "56b565ce6d1521cb87b7edc587489064a4a03b6f62d38543cdb6600666dbdb32"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "dc26fb3f030ce3552eb2af74d15866cc2893699d4344c1ccd143ac066b0620e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8087818a17c47131ffbea2123ae5a8fc14053aa34cf788ab3c6f24d70d85fef0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "78f537bcf7fb0d374ced486ceacceeb399bf2181f0df2b7a196e0aaff4f35501"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, 
"7f78e318061c8b783177e9d23a9cfb9a0ab84fc8751f6ccc4625510cc08ae9be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "fb6c64d85e946c3f0142bcc7a977f2ca8cba54ad72d63bc56d75f1577948465f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "455969b9b20c44bcdfc601aa32d4aa2bad6c65d6004470edd2bc6889568fcec3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7af88e1c5982a79e1477bb14e3ac55b858dfc7f1962355b3402a5b7632843632"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "64255add7c035e541b415f13cc072b23b14531fad967914fd602d68e03da460d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a3207443c61baaef52406cf85ed3c9896dea3e3212cd625511e8bed6d3d92f55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4349ace32a6e99f7638442c2cbc642aa740b7bc4599876e1cef74ba7c356ae8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6c456c541b9b8bc0231c509614123b447aef76eab60e2933ca63498225ae0751"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d25d3e0cbc6d9e9a323085b5d238b095bac5bb2f390a1b912a8ceb49de30c330"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e458b191d6e2496553334ebe43cdacde01dc95d8f1f13478c208901bcaa0834e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "c6b797c19ad92439cd1a4c8d3d844cd389b41303874fb5e96de5a6b46cb2e568"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b35a6d7c866a483b99a66baec5684b00941987cb49af49e1d613e909364988d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8538f5c03fce4117c09cc4bd323a1c9361e9ab21bfee7e844ae3c9447fada728"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f015198fce4731158127fa641cd72bf2bad7674d3d65e5d568b523c0058ee38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "c72e0383ce3d43a7b66e6577ad212e7f7c97f6b66b48478bf26739d2fbf0ea4d"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "812d660c498a50e75c70c0c9c8b1419488e48c6c3ba096fe38555e64e8a521e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "96d0b942580095826eee683f6c9a184f8ae9e67e9257e4901f878e71162c2a13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "47ee2efcc7b18b7b5bb0216b930193a17154ef6c33d147c6fb64121f6b4bcc87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d22f8354830fdfd4d8390727efeafdfda47eedc9a2d5640c1011c21b03b420c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "9e392161f711083904f2d9514a9cf1f087e68bfc2e529f8998f90134c14b7a5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "111912d97906e2dce4011f070cb2b39601a5286ca0e6026235c6fd3f40215036"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ab6a1f2ff9b3f432b39c626485898e8c27a1ee771ad056a300083e8031e887d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "18bb5068821c9544a80929955cd0102868fe289141db6bdbfa863a47170f3f62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "45d867f7b6e871591ce12d8cd431aeca024a519a5115213c3659a79331da508d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a6ceaf8f2a99127cd846458c4043ccf000c2cde7d45f60561a59c1634b447b4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "efc3b5b4024c58face414ebcfd0c3343a9f13de08f2e0b483802cd397570d45d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "61b4bd11da89b1cff460cbcf3ffdf125da09738e1d2b8635dbba02a981a80b1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "4ea1f5726ada8fe473cd9eb05f5f5df9fc277d28a5a24aad313ccb9fa2d52f92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 
512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "55e6175e1b2860225521062d765269c3baf7a1412c60ea4ec8c1d7fc3e505e98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b78a3717dea99d90fedc6a8abc77d0ceba99d769b338260f26c1807195e6abee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2b44d6f3e363ca9d45fae830422b3969e08aaa9b43c97b8c97b1b937b61dfa81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3d9a5a48c67e886580fd58b49322696e37196808e635ea1b4e134a8b4a202905"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4cd1831f36f4f8d7cbfd3d0274f26f40fb837c161f06aa200f177aa2c43b96d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6932515c43a8790792d9bb5eed2766278e7aff34d43afada16188db87f8cc7a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3a1ad9753348966dfdf7c7d6290e90175728df1dc0e8c9b743ff015b5a55402e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "772169906011fdfcd7fac9fb7131d928f6432c1b0d424954f1e2b97c8de8ae83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3e53ba3cb4bb5604d1b98d4947a96f060d04a8fbedbaaacdbfde5eb482cb6826"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "7399e8a7e2de6eae182e447fdab8cc4302727c1b3f0269abca908704cb7662da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "31623824824e3ae5b20ed82f53b265c6aca6c46ddc13ef7d14b6ae20eb661c1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"ee18197560978c80149bf0fafae4c6b9b3a3d9b1672aca8b2cac09c49b647834"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a0221752df029fb863f506bea5a959673b39328cd0499dfffa1069474c97eb99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6009a1e61fca6de3406b3e97f6980df5406321237046866aaae65009d9729788"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a5bdaba3eeb9bb097a8974d8ee9ca2d6cdba815ac9733ecf5eccf192bda2bd0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "f2005b3159b4a341d4d7d8a955621e8d4fad4781f41af7279dd93259c89cd409"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "68510b547744e583cd84763d6cf2882365c404eec336aa14b7fb18feba2b0316"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe6f16cccf27935fd265df62e0031b33fd380e8a38f1fad136463b96bb50d01c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 
0, 0, true, false, false, false, false, "e30f3023602e9ca3650ac9a85927ba020727e15317c5d74488499ec6f02f739d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "da42cf751eb12d3204c9a6161987796c4fa3893cd97a6d1c365258c825775fb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "7a07a4ce96cda87b26132b8169eee7ea4dcced32968780fefd660e8a34e65698"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d6e894bcaaabe54a7c610f5492237de1146378399dd338ccacdfcfc34d63107e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3c1e246b8932e7f9f0e728abf2b6781e5891ed342f47ee1f3dba96b1ebefda40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0ed3e278c988fcc1c9fc204fd36fb6cdf7953bf6da7929b5227b2f68b722d28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "701ed326b5b6ac81915ffd8ae4ecabae1bc0909a80d1ec952a67dbd4ada5a893"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "841b083a41b53b7e3cc1d61d3f0c7f6435be5afdaee6badc94a5de57478e6ae4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c7088e5d807d94e00617662b6a12a2bd3dcf64ce808afa53e846a6c1b26009cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "63408df45ac7b12d6e34f4f293f5122c4afbf951f51f65a3c8a9c42bd58607d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"2fd2d3d8768f537428ae2edc614b1612656b0b3b58200cd05d2cd19f1bfdcc5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "4fa2a200270964ac5066f9deed0c8361ddadc8fb6d7beb677b160dc0f2108356"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "98429fc9e856ef519cba48be026b823c4c55c0a92f2606cb26cca330ed88514d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "b1d788a0765f8f2ca5ff2b1c128498d1bfaced26ab793add44c45e1b2c7b357a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "4a5a271ac2b8cf1b8c3cf82eaa874a320fc263f814d8f808fc70867fa176bd24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "646a65e01dbd488ba3c63ebdf52c2ffd48d4626af2bf111bdaabd93207d4df81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "b24dc21ad571651b0ee8a830a2deab08faf11af80030df456b50f2fa9dcf1efd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "60405e8ab9d66c3d16cc4cf3719f483e5467a34fe9b4034c74bdf12b9461528d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c5ba3db03f0c1f723ea0258337c8ec615cd2099f0cda2a3aa54fa6c3d991e382"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "feb87a6e308701407c51ea464f4fe1f076d564c546812051a59db8788523aba7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "de6eb29be48cbf3a69eb0a49259fe668225922115daf1f69c5d6fe28eabedb89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"92ffbefd8fd6b0287aae78550957f0d688e1c6baa8153b90652b14f65f545217"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4f7d2c6f723baf7a9b0f68e6caa2e7b32d0ec8e6831d1c02086a160a95e5d89f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0bbf50d17fb58bacf98d0d9977822f8603e781eeb1853202b87dc91522f15801"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b79e2bc01960faf161d9c8a8bdf748b4d1cd2d2631d1ee08fd7940da319902d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c56f2bfe48a95d06dce3b494f2f611e566bd353187fdaf54c24b5915150c7682"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ec4a7734ee234165ad0de1feef98732d9d09d05f04b4e725ebfd7dd312ad446"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "19c3be57e55dd359452c0967a28b276033bf1c319e39bb1420fde01ea73ce31a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e9972ef39342938429a996868fc2f3e17bf3e9fc07f63ecef12a77ba8d333de4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "37866d288361cb8e3704097dd05d9e1eb792de3ea223d4e1b1afcb41959721a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9d4c76d8fd6c425d611c2cb27e703ef8b5edf0c1d0582beee79879430827593f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c55973ac5d52eff1f8b559f7bfc5b8412d32f5bad696f9e3dd7dd732e4852de1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d3c3f88903c271c6600aab7800d42003b57d73f60b37c01b6484abbb7ff6a313"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "2dad2e5333939d5136e39703e6acba68be5e3ed02e54335901ec56effc78f257"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3651bf6f4dcad65f8be618f78218b6f74c2df66c27c8befcc5be7cfeef42d989"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "072bbdcd4b5c31b754d125d202edb8249044e91830c1498ff7383e8886bbcfe7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 
2, 8, 0, 3, true, false, false, false, false, "0f403f443af2431b5848f28c5d2be3808a8dc368ae6520a10e5dc0b44b07cdcb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3182396f02cd2ffc4cceaa0a70c07f997a74d67b6b3328eac4fa78ef7b26de59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d54cf13223b10dc3a7edfa06668cdb0b8be76e56785dacfbf75a4d962122d9bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "ee462c57b76ad7f52b4235887802326a680e566fa73e114bd47d9d0090882e00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fcd37f7c6236fc302cf2fd16b10144829f80d1fd8718a027d36e361afb5a3ff2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b99a1087e1b7c058d343bf235c8f6fe2b853e1dd41c2b2744eb41755aea2c119"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "b197b11f37a3311594c5dc1e5729d975be2854bca587435fe9148f83c8df4616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "dddb3a0b8a608827d7c2472ae617a7d58b9965c0d5c0eb6ac2201ab0c1d0ee58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "20bd6cb0deec331c81212e9ab7474c075db10b72766b4a0399b4d57815409765"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bae2ce0bceebfcb3bbea5117f73709e9feef3dbdceb71c85629d5c523bb62e42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"7db9e58c38b448865c730e5740248895d71f55c84bad845b7e6678ba9cf1d8f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9039a7982dcc7a9d1035211902da0904b061ebdc2e8cee4877e6b351fa051837"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5f175b3701276dde1af21909d3b3c6166a20622728804f2de8de55b41e70cab8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f3cd36224d70cc26934b22a7e6e78461abacd9f0604a453a4e7ec697f192ad10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "05cbe1ae8d7c500cd2d87bbe54be7d9c747843d2900d2867d0f0405a9f62d891"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "dcb04b7e71196b22be2963c484fa141d580bc7a0ea52ab304f73ea3902f4c348"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "2f8d7c7a60c02a06281a46dff2e5d073f693df866759cec1e635c3cd719455ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "fdc7f2f4bf8d4ef946ac400266e340a99e8c101a98ce31880f377f674ac12752"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "55c94253060442ed04a0d3fdcfada3cd8cd808e44e0e780024bfc48559ee3cf9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "14c180e5a022fade0a58958b275d732d02f093c54e9f97f912571280a9c8b01c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, 
false, false, "b2f81b5b42ef46715a122a82d84de6a4cf92c2aa7443ebfa3f4cab2ed54327c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "47ce44a8669e2a7ca9fc31a48ed87ae2f2699f8fd96314e681a32cfd25fe217f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a8312c1b7676bcfb5ef1d774c911ebcca51d64deb0937e476f83b4bd9f895531"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0b491f09f132dfc6ae53a8e81c7461865bf36b8c8ecf0fb2120091fa2c56ce96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "aad7fa5dcc449359d4dd079552f7e4d667d3d96ccdc8fb694f0a60cc7cd1feac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b583791ccff1f249c0c83ba32e92fb67dccc9fadb41463eab4d3b5e8c17b41f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef6e859d3f80c08513b173b204720132155593abd2a8ab81dddf02b3619a38c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fb20f53bf763202c1d485f1488c79943d9c69f2326067cc07d6ec9dc6220a4a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6e31806cad5e7f5604a648fd357e99c76fffc59813817f95fe5a710c7fd4e8ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "bae0aa5a1142320fb5fca5d8f452bddd7b635e2f87baba589bfa1f3367024a56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "12a4c29dfe7c01581a6e587ddf3ec3ce965a3cd850b89094adcba2ee155afb96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0b18282a1ca6fb739d676626fbed0f8bb55bf6cc2ebbfe93c94db132e9c8af8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6def0d50f19b2f1105be0081f05e2cb33d93c7244b097b30f4b06cbb61146e67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "03dcab2d211d2ee86007fd736ed51522dbeeef49755530f4ee1d27965b2326b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8a0b2b167104390c23539d847230c4779b40c7f65eec4408821bfafa0b8e6781"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "59a84caa53ff28c355a3bd5a4b18dfdf5b521b202a5e2de6829c1ad550a46a9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "936da7d57b46582d0244f74f67d493c8fb55afc73b6f4f71bbebd033df9e6d22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, 
"5f93628e45e30518847b5c3b3b90ecdbac9f89e5a627503c3ef4fd3084fca2e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "64728e41e867677310442222921b10eaa6d7ea69164c015ac2c7521a26cd6b35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "c54e3efd1d9bea58090a4399a8d49141ed6aedb0a5db1f4e2b908397a6a83369"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "b8eb35b65821d9e7d27167a013ee98cb19e41b92cee3bc460d6b26fac367b3ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "92a3daa56ee64e471411bb8e7d5ebcc95ef37610f2656f3fa79787e48753d904"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6008ff5e6ff90cbadeb6c6ad8a58cff42c365238bc57b0eb4f5e34678f4e739e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ec666a46dd0e85a7141d4bceef7cb7d49a2723f69964a5387b558c16a09d3b66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "efe1682bda26700781b470a943541d6b0a8dc816d79d283342f75a51049b437e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b0238c325551e8542046878e3849d7cc55a5e2928198a8e4182aba8a733e3214"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "db91d843cca016c7b3a6ce7848d23306cf74cbe84705b0efe5b4cfec0c1c21ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, 
false, false, false, false, "1fad8affe33e80c65713543eca1c1a0526820be24368c4256b85150b49cb2b73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "93c6f93e349bad2f62bdb1d8b7e269fd2966bbddb5c806380f62b8c1639f9caa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "5f9d48e5f6bdecdd997fca5c40e90fa61088fcaa3173a8e245ce33144de5a53e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "e15de83190e190c2766ae241e223d202f4320bb31824b8edffb8ba464626a8ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4be24517715ec7c5bf9c06fbde66fea20d023c5adc4ccea86df004f37c3edc05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44481d2273f7a6b7a5bd8161c91cfd1c47cdd077a22069f5a64d711f872a2544"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "30f787700cee3cf764a6071c6c5a3799a8eec3077217bf2e1a6b090e8c497a4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ff11ae41247c4f9c8c9665987cbd9263b380f59527dbfda1b67d6b42524d4278"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "ab7e6cd8ad4d6e651f4a0d7556b4c45f09aaa56dff09d33556170ffd1161cc1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "59b4a1eb4d802abb700c74380d2544f7eb46eb6a36544575e96870f5b839df99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, 
"661c2fee03953f4c4d900c9b43309822434314d748379cf814783b3c60f2068e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "d5073f4dd622f572c9b56ad566b3f140e413a7e58c2a9fc096aca78d28356e43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "ece8266f96e977a4f20ddf79812632923d636e35be487d30505ed4f034b578dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "1eb4a120b40368b28d68fdf3a543ed7221712060676c0338827b0e68bc892641"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "7b4e2487c6010afffb8c44ab1afa4a5bb81c1df4256dbb85495189abdebe5134"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "9ed9768ed56d860580a465862dd3cafbe92ab6847a5b2e1f668f9bab0d8c9dac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "eb5e0438004001f2a3e4066792db4c9a4adfaf5909609ce805fb37c4d0021771"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "912766fb468111f7842eb946d77a01c01111265d82f18e27fdf4b855806c60b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "d4a76299007536dab31c9ab01ed19967b3db1fa0e9602d54c511caf9f0bc0678"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "bd94ff92cf0522dfa390314491dce1f1c01db57a5c10ff285846af0aad722f41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "b7a938d51c6b403552d97a5b08fdb9b7affd4cef853aa5bd6f24362d104cd956"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9d918982cb886de9b9cc4d3e9c4f4352dc55e4ccc4dd6833ac6696c03f0b32d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "6bc1070a81603091b5916a87439a973bbda1cc76d09b3df786a226ec4a031b8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "9b34d0584bfc981e893a41efe1aa5bd33bacdc154db5f731c5a81094ee46f994"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "63b66a4a9e8c9e1a6074eb8860ab13fd6fd4380cdbf9199f766fb9416fe9fe17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6bafee8bd7ee6cd9daf29b9d4ab5c4075a210a65ec15c44f2d251634f563128"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d9da6bf31c545ebcbdf0983b93a4100f8eebe2e59a114a3c40d92930e25ba750"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2617509ae870202ae456a1894923656aeb64694c2f9364de08d43d6e17b8c0e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4709737f0738e7c74836343cfe7e7b9287dc9778584f5198455e46144d6f8d47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a3365f584931226d4c5153d70da6755d25539d54358e702fe19c261c72362394"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, 
false, false, false, false, "3361232b68a9cda5736680e4e726b266c3889991a0c9739d7b599cfa17e7fb79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b3aeb985905e62e3ba766a5a7d9a477d73d93b998542f65741cd8b235ea9f4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "9b3d7a8492a2e7e014a4d130b5e43ee2db3b9c5efc7832273ccd78f287d39da9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "72dc4257b716c7b2442cc0ea517ea645556f75bfad5653117682075b35226b52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "aa2d56e5c251427c32edcab60e0cec579ce08f98bddc081b88e89dc75e492008"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "5b2d760bd2590e362fc3c37809afc2dd81b4f86c5478a4279cb9f8cf837da215"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0f129b3adeea972033bb604cc82b4402e5a224af5fbda3618a71b93ee992837"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7ec8328efd23f0ba6ae8a89e482c42dce09ff40fbeaeb3a9d6db7adf6dc07bd6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "592be1c539b3d221f82f474f44f00a4f37580774380c06a4c719c835e5198032"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b210fff426f228451b1d7389ded154239a406d63fcce98bf5eb5ed39662a4e11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7cba359499943a46967eb6c15c852a0d94c6a32b53d3d27245e8e5f2b5de464"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8b8c4053d16abcdc55a005659b5a0ba86f14165b8562da5eed72b2b9e3ac3f23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "bbf7f0c667d57f6e79697452f14a2157b08d0b62bd75c8043abaeb3da4bcf499"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0e676334c37660fbcc6b6c3271b98b95b41ad89fb8909f88b710ccf012a55749"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "58c57161c1e22e90b9a3240511de435f2230da5f0ab7aa44d9bd3c324c161b2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7494a0700ac954543cd4afc779d6193cde92ed0b866b62640200eb77733db81c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "9511cdcd851eee460bc17467b583d3116b81e2881da36a6defab97e2b72823fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "51588117a0b3628acbd487f5db2f5e44e925af1061992683221d0ad5a0ac884a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "fbdae4fb19954afa3ca9843b35159f7ee766a53281cd2bb611da75fb903e2f02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88a18a30adc4f899b1c0f4b5e399555e6875279cd42f2877042fd4273359614c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f4ab38ded10e1d9fb0f86a56c40d84cacc95bb9ac7db8f52f9e111ccb4346da4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a0e2c6fb0be32ff4649d698dfdbef740680cef69fb833151e2ef9ff552b2731b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "970156cb6048e24393bb669a5b40988a26a49e32339d865f77cae02c10fbc9b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "287b29a164fdb6474ab047967fc52426ac56f78836e60493fc456dd5fb591b0c"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "052dee6bb49f77dfdf3bc69d755831165c6dc0863071e29de916f451c7257567"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "730418aac876dd0ff12772987d59560129c680fff5f320b049f597cd67c41b1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "0f4edc5c8017355d2f2346598f7c2f15a77ae274c2bd81137c385e4d81ddfee6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "925d7198bdbdb012a064b2e81550969ca5ad2a2b09090ce5f2f025079517a22d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "b54fe8483f8a3e1e3b385568aa09459890f63965df8a71072a51f462217a3348"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf15b47000b32a5cfd77a1d975032d15c41bcbec3258736fbe5b6b5636f60cfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, 
false, false, false, "76158e329a8a3749f7a2593178f11137e3ef5c3495f0ef3d7b8ed852708be7a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1963f2f427a216c4ac93fee76ffe1961455f95b8c848534d8982c20e09a6ff48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3ddbc76957fabc50e9076fcd609c6bb85242f9a886cfaaa7d1fdb69e4f815a1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44e7c92b4fa816e04fe1bd98a17234e2a282c4dec3a02911d0e7aea2758f0fea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "7a430d90226b64b9fdb15d7e657ab791d877e6eb4ba2f555bfa85402bf7e1710"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "312c6692c09314f1447db45ff20b58ecc3855b8260f2b3706e94792bc30a59a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "01526b8d1f158495d9079189bba0d8fa76f5ae041647a6ea281bd8af3d24d724"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "3edef97ddef59e54572448d8a655a531740e6b9ebad357c266b16691a3e86f72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f33113f71eb1cb2c46ac4e5f6f13764e3c34f8526fd1a3274ff75d75d6a9e961"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "98aefaaabaec647784720e2076e4e72d2c0959410c6a92d0593aee7862229973"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, 
"fffd29c013fa455f0816dc371509021bf3dd296547db9208d680c942353fead2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "de12c878d1bfcebbda2943e4d36abc6ea81bd48613a932c4e6599fbe51009342"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "5e77cbdd92a6bb43da73141f8161bd3033affe6df7a0e95d31dc44229646028f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "03ffaac28071f6a603699b8ba53eb6c9ab59d1703865be80eac658b37dd73407"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "86e17cfbeda923c534ab78e9ef7dca994460d5a3da294d7e0ad78c3fd54694ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "c1452e0f66f4d506e4633083647d2d48f724306454a26063c1b7302cd1aebe09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "df96562b66d42dbb0e36b1bfcb3914a22a2787ffb88d2234593f89afbf14bbd4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "20ea4938089ca09691eddee6b51ebed79cf29ba5091d76c009a85dbcf79a3482"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "6b285cf23e49b39ada04fdf91b82f15c10f77b253fbc22f8d53e30b576fc08ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "baa65b2802417455247287b51296d5351e9ec21a3b68e5edf37ac838bf5ba5dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"6001f4ca71894e42398a155aaecb681ecbe1471bf1a0038162b7c97eaf9d414f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "395fe57d51eeb05caf73bf656fb59d0a95673edbed9ed08a50f2388d7b282363"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "958ab4386e2baad928671ef1ceb44990aa8d7de736694dd1e437291b4c1cf10f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "245e4923ef49ec8f72923353ebeaf3819cfec14da27697bbeba5b0dcd2096f37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ce05e995bc24ad14e308c9670487a9dbb8f1b36e9ede310c3906559b69cf080"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c9bf063f368d449960f1bf4253296990f7efed663946c0d41b7fca232fcfd993"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ae6feaee5176cafd6d7a4b1ef23a4ab276f62070882977e8092f38d9419ece8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "86357637085f161c4a3bcc4f4ce5c8872bc6a2623cb88fa6857b2452fc27afd4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f956880218fe0cb2f5c0e4dcee228e1ab098eb79e8516a398c24003afdf3ce47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "55a44e3c4ebcc4da151e795adf6e7e6fb508187915695358eda09491cc134149"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d83e71324a541a5e449428628b08ee9ff3d9c345e535aa47ae4e5010a71ad692"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "6b6f9806a7a9bf29ee23319b1a56b73198bf5479cd51d6588556ccec19a79001"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "64e6972e7d0667e95347b3ec77831ffb8daa3125abf655e83dbc4fbe2307ddc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3d25e6e61be5d532ea624e5cbf338c31c5108e4caa0a353ed9be1cefbd818840"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "288b7fd257f39f4c4003b8e2b725c8e7bf81305c4deef0deaa760962f37a383a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "004380b373ed3b1a6dee2fc0357931d8b95f60a5405051bb4a458c20cf6d2e7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9adb20be2409a7887ff0c8ec674e25139cfe69339a2f41f9a8fad0d850e1881"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"7c84acdcaf8b33cc6d080306315e231a4b368f064c19ae927b458e769a7d59e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5c4dfb23555feaaf890d8a5ce486f9f930b5eb58c1792966ab425f81c1f3529f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd478480f6c34a66258f8a95db81af9d79f0ccdea447ca0902afbfcc26cc97b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bb6c6538d53872b4def891e073213569713e3dfb2da358cf7d1444b0e68f0943"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "4aff95153f588ba61a100df4bef198ef45d1f1cfe22c7c4ab2088131023b7749"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "886a4659633988a3477763f2e5c14502bef391c3798b56323a8c631ea3de9767"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b45459ee91427734e02d5fd067d0bb254a4a56ebd3acfbabe2ea82de4d76a14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "2f09d79d869535c44e914a6781a9545d0ca7516aab54844f480099952eec131a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3042c84635f2d21848bd3e5fec4cf827b2b284a1dce52dac220f9d8be2d37161"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "94f046e819cfeef9db61278580a65884a3274929e74665b541a17e02e03df9b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 
16, 0, 0, true, false, false, false, false, "b77490ebf4c4ca77071bcd906ffa5c05391d54f5cadab910b9ac0f5f9c3cee44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d83db0dd9d384983119e00c41c5f2ae2df9901b85671df74bbc8dcf886e31452"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "83e28248adb0f786f92b8f67eb5f6fb1a911a92173980f3c1858daaaa7d98c77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "3c3e1edd6d20cce4707b6c87f7376b452e8c313195e9bcec032726bc5c235f30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "21f08cf860071fe53dbd398d38b074febd578405558bffc2a6f29b960ed2c449"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "adca635c7318ef4e3585e8b885ddc7e4e6f9aa962f8c33aa7d1b513bef952361"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a519dcf3cd845cb720129d66cfe672b94f9ada37e2bf3b28cf1ccbe64917c2e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c45e98dab8f7e8f76965b00dbd22192d1566d7a2e90c680ea9a711596404f79a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ac842d70c33d2d6c79c520bf9c643b2bc0268b414489e2b64c77e3bce466fe73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "868aaa29a22ceaf3d222cdf8351adb72f52e640ef633a692d0870f36a7b4d970"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, 
"a4562231e55348242029b354c81a7b0b858548dc83416ea37c6b0d496cb92d07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f390f9312f9c8bdc60e1d954543210a10340fe5eb65554b1b53f144f6b05d596"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "5a88a36454e9f29fa85b18daa562b96a87044b0216c8988a3fa343abcf6912bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e90f36ca2c667ad9d6bfce69e637aa6f107e565fd0e7ba23b8e2f73af3d55e1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "6ecca10d29cca5dc5274db435cd14630ad79a2156ab8b8bd2f70fa9300fae2fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "3728444ca79320d2fde8a22b45dbc3aec9130d89acf09745315a05f56950f2ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "6a04b67d2cd58fe338c0e3f6095ca87a05a6b86a0d7ead7c839e6147273e5938"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c2fbeb8a06534ef03a405f8249586c6ec4eacba55b3592d35123437a8001c4b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4416f70045d3bd3a1fa4316f21ab20f97627d25dc56eaf96557acc47341687ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9a1aa67c502d252a68ef69a60aa5cfea566761cc9554d3b2734c982e8c22df37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae6ebd884ef0bc81e0bce62f87a99ea6bc9705e707c0c5053a102f6472f900c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "38640276af4c9e4560be2983d162aa511c3463895cab33371c49c0d9f215fcff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4a23c72d69e0098418ed2d76f5d45542c76abadbe9295de152526aab6959c481"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f1cc9ece73108be520e1a6e174b60cced74488e0adb6fb309bc3d4b53dac300f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4f9ff7da823ab0b7ab8af298ba399abf20c551f5b3653374888566aec1124c46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7e84f805e73c387dc02e8c58d0aba861e00b31505039ea2f75b6e841bd3146b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1bf98b254e090cf12818681a856befca3f4e0acc9901c4308cbd41e400d87108"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2e5c3f55a042c8d6088487fb6345ff54bd8114c996491a14e22f1f6da729fde6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, 
"bd5826df2abb809909768ed21c4a9f0caf196226cde9f6265c056bc2154a0af5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9a3131355375a1fd0a15ae3a794ae4b833ae08050b352c26c35a69db472e9f9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8fec20f3e34bab9254a3cfa3e6825ac63174ae95de8d759a7f549f6b85f8f66b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6276c87c20e74f52ee533cd3de4d155ed667fe8ddab1c3d462bc889a4181b24d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, 
false, false, "218c0cec407d486155763caeed6e01875acd39f56821c98c4d51b6e1a9e324b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6bdc265e871ddea1d5ab4b283750db9fb5106ce200096105ae546bf73653e5a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1259438aaf03b2ea4ef1e5c4e475fed5b5aa26982846de546483fc5de3da05bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "a2f788c3cc32b27fbebdd785539ff442736c2a2daef9de08972cbd0dd5c44809"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ec215e0bf7c1e67ac8363ed865f438ff163d75554d6f58adf78ae90708de8d58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b67cae85eb29a6850276f815466620ddc27af33f418e07c5c1adb2adfcc333d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ce451b9fec6d6e02445b307baa7cc67d3736fa35b7c7ba7e8787455356061957"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c063377bdde178156a6cf9e5ee66d66b4e46b3b38462c69cc5149073311a6b2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "fa3903a739af30386600fe70526d57ea95b888d6ff75be715f63af1ff9f65d63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d2dafd5f13444c3e7fbfaa3a8ef4d3f0cd3d628d5127062ae12b66408c40a3ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "15080abd48811bf59173ff99b05b3063ff98e149efdf86d7bb0d9f52411f07a1"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b42825600d380c4177eafb56432031f9cae7072666ea2bb795d38c133af85f7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6fd16aea5ae20691191dbf6eab61382aea78e4ed6568320c7ec787a5f625ba47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3e130155bbfb7dfc8a2dd75fdc9053de81861638db57e78ac74479112725bb43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5cf5cc9e554f06264b2293a7188914e936004c8e1fa051675e67a44dae846f52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "486a1482a21a650669bd449449c9b682f04d3abb2c4011ad31fb3adb1479187e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b9b0b5d528859313c1213fec88f44b7f945c38d20b48903f61b0d5ba2db9f331"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5a37a812b319a8e98161d7acda55e1e24f1fee0078ebcd829dac3f1daf99221e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1e496dec8e830de3d70a83fe0d77a508c9cb4b0d260f2d1f28652769e3adc495"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "803d191f5170528e42b6a7793c4f988d0bb363fa9dc58bbfb74ffb7bec6fcd4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae41a4183f9e40af0310f716e51f37b08e5d5b5ab02b210bb940a9856bcaada"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 
0, true, false, false, false, false, "90a22aab4a2e5767212fb5d4d3966258cccfb0b8941780957c0328267434134b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "479423ee65b5f8822cc3c0f4941b8979de51006bdc763f7115c2dbfbd73f4181"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "967fd84df9638528dc91c4d37812b295d10be16f7ec5cfd01a1a6475fddf76d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8e38f84bccee68ae1433a317c1cc87039a9aa177d4319b27dc3b907b6802a25d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9ba9ede20fdc5b19a5ef23c026e954977f25baff3f471b0d1b6d38ee85aa08f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3838ff0a9f9933f7b9857abdb4ef686f7610219cfa2967749e34f9836ea16139"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "33caaa2f2668af95b1b826e6aef6f059c9bd98e9e098f5dcdc2294f26718ed4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b6eef48a9925feba1b6e5ab83c902e4f476693de4c2e438afdc6e8a1db6847a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "bbe9b4a86ea0e39bb94f9acd20c1a88726d783d6b1cd387764bc7659567d16ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "9c7295ae447c270ae063d0c43ecb96fb642ecb669bfa196fb932d69871566c52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b17fc2014848df9a0019661db917cdb02f0c8090ed381f4a7cd4c07c0a2f078d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1bd384f528dcd572c0bc86a40150cadb633587752abb3eb5187acb2ed62e3f17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7d7f079c618b9c96242f4118f00904e08d58b49545c5bf099a1049befe0c5ffc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0f8fc620d6c3a39dabd31e8c1ccfb00fa2fe947d29d9d05089d54a0e4367ffa1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c8d98428db207ad56539079c3c3e9f7f58994c210a3ab1d79913b0d2f553f61a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2b07148e4af01a87dd5512b4f2ab0329ef46a0607ac11dd17757f2b82ad51c15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "71b79bac30b854ea7415372e18ac3343bfd93bf915e1f68ba486c47ff40be56b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6c5068b2d28782c73528de1a597d191f5ad33eb22d1e5e40e5e8651e28c1eb33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"f01b152f42dea23d3cf46a8c6f3817a03414ab308192897aa1fa164e98774261"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "acf852a2a8616045ac160259f2496cccc1be16781aeaa4909fb70ba4ae42fb14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "000182cd205fe7f6c50b1559af24ce5cb354cd1ef8fa5484dbaab5e405a22a68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "cc9ab4dcdd661ecfb9cba408d4906bab570002cb31a64a92489399208f29327d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "d959eabb50aca38d52c6ffcdb8f3a439f85787f45ce7dd9fa35b8f6319ccccbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cc9099bdf318b58a5c3b64f480ebb2936eba10df2796dcbb7ec09d397d0832a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a04c4bc737bdb35e579b97181e5120930696022ab85d9a0a47f055754d623673"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "731850d0fa4fd2b8f164b9ac337ce770b019ccfbb0432c4e52c34561197512a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "344bd10977e88e9e279ab3ebee1742e1bc5999b02cc214ce6570d71d410c935f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "671679bbbf1be35049a8f349e6e11e6a2f936f84411652d15f06545f225350f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "11e5793923f273d607b07a0168047a80862b0a7570e43edf3a444e5b1c341e48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9598c00cfa636bbb9e8dc38557d29a6d74dd0c744898cdb1932cb73c7a63e264"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "66fe5207667f55b102212281b4e6143870fea3761f4787a6734e2962c1c2c3ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2a906d12b95eb686cdebae60bcd2b6677c9a39b0cc684beba855159e8c89b4b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "24fc9ad5e9589174867c939612ed7e6bfe3dce899d231998c0736ef4c4130be4"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1959e4c14b3fce94daa2e60663602daa1e3d90d5e5ae0433b9c828540546646"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "79b0da6dc21a0091d9051267df7af9abe66aa0bc6eb3fadfd3679265fd0864ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "56fa4ef94d940f244594ff51d84eb332c37c790cbb24651d8a2082c38dfb698a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"cef4fc28b8e952493a84054023fd59d2bdf9727606b051ee778817b7e6f1def6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "842866f17619cc904f94ce3b00d2b9bf6476bd14a261c360d9ac90089d33103f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7b7fb7bec34a406738dd7d53f02210bc3385be0e0a328fe02365e8b707371546"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "7037c2c3ff393f12ac9ca0b5befa156f06baacf671eb8700918406048302f408"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c08f36b7c89c337c4eccc6515ba1928f90aacef6cb66f72d8ea1326ec54403e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cc0fa0c41b392e60648c1163456e80be483f34942471a19c5781b29545b38787"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "9863bef2f4bfe9f3297c9adfee82acfea6ddf2100ee4380871a4a3f3ac6175fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1a79c7e90e31bf8c8a0690ce1aa418af5e4a414f2f08424afa3318d4a13d4242"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "d46acae22406d8232091e22782c90c344bf4dbf62a1c6fc0167c7c53e5a95e0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f0548917bd159af25ad89bd83a48d0bd7109e5379cc9c96406a2b7a951bf98d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "eb88a904237897562489eb48dd83f5f8cbf52dbc40c5e8f24ac1929af52d77f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 
0, 1, true, false, false, false, false, "910470143109cfa41dee4239dc059df76ea56013f4a0ba9afadf1fdcd1d6a0e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe31a29b66e5d5f086cf086210a6c9df9e461d01a8c17e37139280ef5eb93218"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f8f75c68a230e30ee7d623197ca831dd98529608a82e823625255a8cb6e7a995"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "3a8e0d7e65eb1ad17f776577ce38044af71efad72601d5e4f50b8ea2e5a7f219"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cd3bf335f749202b056e7c27aa6db0eedc0bf35aedd8523f4724306103d8e6ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fb89c9d911d299d1d0885c425391bdffdc485a8715f5181174db0d3a58b4e520"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "32624696e44e583416295b7c4665ad9835f3a22daa6e0710f420078b103d3278"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, 
false, false, false, false, "ee629f4a3faa5e22b986fcd3bcaaef62a7630002f5a04d6372802f775cce3aa5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f7b0d4120376ac83f65e10621a42580563c4eea967d84da713e29dec4fb98cde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "fac91a1c34718c810334d96adaee1d22e1ab3c757bffa0ebe4c0b65e9eea6b25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "72c513003529699d803623fe86e80c4981cc7bc051969e7c272a812698b7c768"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f6120b4c050f7c756bf7be9eef5a9c3ddec61a7109d3d966ba7cf7210ef92e9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "dba53dfb1e56d2af0e2d6bbee54782d9d3979d3042e0f8671b3a588df21e7119"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "6df5bfcfbdf90427abcb9e8f13f561fc55f08cff420a0477b566251b1c00eca3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "fe99cc8826b593b63bd21cb8bab7e0edd4ed50f2f4bfacaa8720033d3fb3cd6c"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "95f8710277c1655bfd3d7aa20ba889f777d671097000d3c4af5bea631f5bcf88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c76171127ede972dcd78277a8f7909051bb9609927e61d93e85958c635eb503d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "15ae7c54e0ec8f1e72972f0d39aaefee479fbe1adea8be846e13aff778490272"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 
3, 3, 128, 0, 1, true, false, false, false, false, "b6fd59ea99d593bf949677d70b3463fb72dc0e7e6c56abc34259513e07a485dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "ab392a85cbbde5a38d384b17d82ada2e8ae3cd2e244a43e09a8ab983afad7e7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "319c3f80dacf44ab27188e150b856d64d9217406b3850cc709b3a2fe8ade8a30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "180283ab1e08ce7c5128e5223947c8baab87efff7baa91846834381aca9ddbee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "db90cc96cd3049adeb1f39a26f84e36b78abfcb9da606cbc8252c40ad3e1b555"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "783f821c8e1d91ad6841dae99a42ff2ea11b8ccf41a091a3c4b43252c42787c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4c0bb6a97edb149fe9908d10dda34226256f6348112bb6e646c28e8372a96081"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6999c3945b25faeb518e358b434203e45416d1372b909927c67b53619ba78b05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4947d0252e39471b7692c408ccc95f6cf543a9f969d156e0fa9a69a625d88890"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3597b3f12e3b4db416adee4aaea46c4ba3268d80109ec8f308a78c6b69752b80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "99d054b2947fab7904f64907b66bcadc5eb7b7429c47fa7e93730976b3e1da59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c0610b0f058c27685f32621a73f5e23c9b95ad81e0723fb78cc66262537d1bdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af65a0bf31bda517399819f17b85aec29055b0036480a04571adc66104a5cef6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cdccb36a4c1f57d5859f5fc46466fa7d3c734fdb20faa2c094b397ed0752e3f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a77d0fa62536fed8fbbd92affae0670f75fc1883205bf7b37839f10e5c1604d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"ed6022fc40c11c91fe6cf0a82e40f6ad1bc173d3d42383e39d1fb5701ad33013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a802766014396efb09b646d6bec11a675eab930bcc4ffd0abc7333f343ac9ad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "990208eb0a41975334a92f7d837a816fd81629f162d73a3cacead04b56b88126"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "25a4bea79d940869c65af2e78f1b7cc715c81f083ae424e97b69d4c6884c557a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1000ee5bd0f9f7cb6431a25053c4759b68004d962bfd0601163d17b24190dc46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "354e0a51fd4d45a6ed5d843f967cdfd9a62f57fd35b833de2e37d880e0f68e0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "77c5a9035b6e83e1f6c59b5466f40b28ad5a6e48a10cd1c6ab504c52b4a4fdba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, 
"af2621d751e2c94dbdc7d22462343b8f221979f77c1c06d17e2d2a3132aa5a2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "21fccbfc13514cdfce7480b589b4d1da7a187aab01337d66e2d2ba9909041265"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6027ae0700275bac304732dfa4d48a8658a444c8b32cefaa15cbd97caa716434"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "0f5044f0bfdf7923d09a0d4fd348ed4916b828f66b820d23e4b4dbafbf951272"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "679c4e47f17ebcaeb1073d0576e5f8f3ed658e91015ae128781ffedad639ac6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "475b831336b7123b64bbea9cef05b2d51594b6bdad2a933e8d5b3a33acac401f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "119e2e7ee2c2c423066eef112367988b691fefe35771157a777e4cead58b30c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 
16, 0, 1, true, false, false, false, false, "552525d06606b513464242c2369c5122a900c0dfb5479a88c9cee669dfffb8df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0c4fe67e3a2376fd00e3c882a83c83e09f34d0587a642fc919ff6d51eea8717c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "16f0546067830150119e35ca4f439b0e4788e237d79db009e814d96c8bcb55b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34809a1a79cb8d6bbd346bcbe238eb54ed00bb2a62fac48f164a1243dc42f87e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "cf9a146835e1a40f632c8332a75b311f0948d8ae4d4232128df4e7eaf82ed09a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5e3a0b50a3f12ba3670bf1b03d0bfec95f6509c6145a0506d0b35b1a2d979a41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5affb6a10fb4537113e4448b7cca57557d1c67208fab4dd26fafa19f83de75a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "cd60954b24b6e0bba4fd4b29aa8ded5204d45f129925e53d3b9eb409adf76e02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "25b945cc9e4cbc493a9e92113db939a24e5429f50c5d1343647ce655e57ce571"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "495a0fd5e7fd55cff83956a5eae18e70010a1ada30860c71cd18d8cf8a163037"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7ab3abde357a8946b4de1a3ef5f0a8b191e31e0b81563096de636419dd05ff8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b2f79fe40050501101675c924342fd4264d960f76e34564372fc4af2cfc6cd1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e353bb11f652fb83a5cb823b12b53f6a0b44c3be0cf78dace7f88c65da7c6e55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ddd62731f9640eb309c6863c23a3dd00c698561190b59ad8c9d8d024bda49a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "cbddc62cef7b41b95e143b0151c185c01ec24f0e6186a3f01d8fd6a329c5755a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af2e5424db6e2dff412cfcb940f2338c89e61ba081bb147aea4a5b25071d7544"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5eb34c2b7f7c4c6500c752d033f3837f59725ba78cea164618bc3c1c46ef7db3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "26b2274e9c4c79b701f03fa2be9d675f2d48400fe549fa1f1adc3fb9963675d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c50f4dc1fa23512b59b59cca288ab050eecbde73ba3618b64d9e4d1aa72ad6da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37495d0323fb855a8956dde4bd74227ec9420c13deee8dd9bd266a1d96c529ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3504ada3ed7acaf9453f7919d4325dda9d63d8fcc474f37882d0928828aec84a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "03004440710429c36b1ce5436ada0df93a25127cc4e1b85ef2533b847666c402"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"70bd2a5436c00ba0a316b7ec09d37f0b7daced63bbb277d26191522c246a9f74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7b4880063c32d6f6d637a680bde54a523dda078f8204317999fd61f7c1b49177"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "b717791c919ec3e9a5b4704cfef5d608849d28c53f216c6229c91dee20467e0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d0229206fc919f01400b281eb423cdf0f6472010fe8eb02d29fc47d3cab4c32f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bb81a50775699fa596fcc3582e4ab6c651fbe0c4fe52b222f686fee6a1929785"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7543df7e51fbe19b87d5d318694593706b10b3d36ed39fbafe80fc744c1426f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d50b648f983c2c0c2ed751143a9aa4e80e5aa292f1ca9b742718a78f2cb6e8f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "1855766a2fff92aa739eb79f3821cd95b70ae798cdb9dc2e6d518c2a1e920dce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "226c6db3e9e762cd7990d46a47a25b915e8f71df5c2031c429afee83affedffc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "dfb1ca6bf3f942074ccfd08807345b28f855bdab0cddc02b6fd8bc05f9f8de2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a271b95c841e60f69854a37fc8ace9c12f3e239b11aa37db1096331175bf748"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2cf54daf6094044f87ad1854011ee49225fb8b863a8d75cc987a1449dc39521e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "bf83c8d953d571175eb857766f1d55d76e9d5b180e88c9f80c908e85187b32c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d2581978f9cc5eae715a86427bc7d174e79edb1ae87f1b5ab6a7f2a0d2bb5b1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9080b2d2d4e00aec7d1d03108edb8cb7fc72358b584a58b649d8b516654bfde1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1a7377308a68046725fd514a1574790f2239ebb24de5572bca14bf14c32c486b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "12cb04ea1205a0e64ccd2c6e0db12c1a78a744fdaf1e5577bc5b7d9d19e3eae3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "d7a6aaf8d6b15e437cdf8ea2444234a69e9642367c01d7d885227c5121de97ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "e8ca6ddda093d06fbabd963a272a121b75ffe4c260767f7ecb578101696e907c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "105366702a1cd03200a21260b5e51ca5e14313c09babd5cddfc28182b011710d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "c8c0b3e248de9a6e3ede43185ae3d9f49b8a7f3c63963bfdfdc7d53923b15c28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, 
"eb39bbdd51e2cd4df54a2a756a9e9fca9935104118b427a94bdd37b65decff11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "a109b6a6a52e3e94465b5bfc345b7ae4d519f01820998d208435987551df4ca6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "ae0df58343bc0253a1b5fa5e35873da21796ac7417782156b7cfc0f1477f3bb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87d31c3bf1f1e9413e0f6cb02e502d8d4a7b81a884cd3040bdccd644812ba668"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fd72529fb396a385bb8633406678468c26056ee55a7256504532fbbb0af465d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "91556c6801ad8b8e5cab8d41a3c7ad39b12e5cadb7fa6fc0798cca35d581440b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "86fe258d11d9eccd5ece051b9d64f8445da7b562d2a2781aa1e261a61f71cafc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6d54f71dc67e005eca4bf81a0f51de9abc5481b8a5395bd77f752313e0b5a55b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "349766efb40df3d7fdd8739d81cb615dcb19bdc2359fa168268ed3ce1cd31b09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0cf05d4fa3ce78030da80572e3973269e341de171510fde17174a9bfada4f7eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8bba587c46f50554b76897865b48dc97cedcc4ee63072a13201e6f74ebbea44b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1bad0930b06e66cda02db25335fadc3d7d7e328d09830ddbe991f9d40a713d10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b72cdec59f7f544a222eb88595e8945e8afb028184e48ec76a4d835a348133ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "451773d36d1142463e24a22bbe256ec16c26d18b510e047c98b34526bdb356f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "809598f020cf49ee5898f9e5bcb6de7df63d0e27e0979c5f8b588bcad5b69c83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "db8c5e5122602da9e2b7bc414aaff2ebcd2cb1b5c91015db8758771c3c3f77dc"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "da116c28862f149aa9fafeef350422876271ed715ddb66c38ff211ea1b238e91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4465c19385b3662486aa846fb5a2e4cb9e956128a2695aae45db13a7b24fe35b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "55ede767d3980f4ba3be17fe4428d1efb9543ee2e60c93000a8f25fe24d15d50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"35fa36bf9abeebb5253cc2402e069437b67f1bc0835177102657e012eb02ed61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0bf6fa6e316e0441a272a6ff49e6cbcfb792eb5f22d0998196580f7d15cccd20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "fc55067d9bd2f1459475346279836ae480d1dfea53a5c9066b743d2beab4d9d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "5ce8f9a1fcdda0eb60ad8af7842cea5982de8036a1e2c062d696269ed4c494d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a744649583696dd26f0e5f273b3f92b60eac7e19544b187a56bad15f8296835e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f9de7ac9056e484936ffca04b15b14d0906c0baa8c1f2298883686328e5722e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "6292893b742157c855a5206060550b58b9f80a4a21141dae72798560c3cf230f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "04579d05a069ffba0e745bddd1ad049ea9f06d560af7f96ed8c6093fe59a44f6"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5cea38cda2b1633ea06d5b6bd66e587d3da9a1805436ad01f3c5aff21936c21e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "eb1dca3297949129b08eab3ec891a2b18f10ae6f48241a86d6ef9353add14c99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0b73640214c3c1659461c8793782f035b14c26f6078cf4a175ec40d7869285d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "d54215da901850c8340a0c13c73d401b586ced6552c81d12d915a9ceeb5a2560"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "599632452110201a104d9bc9b6ac982d585879e0fdfabd0c5505531029fef4d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "cdd15113550984d574ee01bc769c393ca53847c17587d5f3cd72a28ddc8c575c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, 
"9d815bddb37ddb5632b5a80c23a54896b3e790a3b78d20c9c3e75f4cc3842ca4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d7df8437ac72c65a8357c955aa7775742638d0ea8e25801d3f1fcbe101bc2eec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "eeaba6fea525cb70ab767b5999f84c348a5c4c7a0853a0d07d6b881c04fb78f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "4256f5f5c27100ae2465dbae72b2835aebdaf180342373a944455b36f2699508"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "089401654a02c6b3cd5bca1f67944775af74182443dbecdd0876b6cc8436f932"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "d60dca9b007cd952b311f5acc85c173653a8867d5b0fe7fe8c9adaf297bf5646"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "dc04ce615c45df36f8f435002fbe4cd7c22844731e18d105364711ff84bac851"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "41dbf77542b1b645096fb7d666ba6a760e170b26971e823bc3673ea8c5389011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "6af2cdce640b9b39244340ddcee7a9620164308cb2435a91643aa8858ed634e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "139e1dc1fc639c84325c1bc223568d230747b07fad4a7243ae3b410e6131453f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87b4115ca460a4877422561b7334d204b05d42ca9f4111415c6918b4698da30e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"7cd8b316d48c662f9120856f9a3c68b2e70bfcc3cb0bd1aab7fad08ad03b32e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1004b95f834cbac4b90dd2322771e8dc8a85817b3b9bd5e3ef06fd0baabbc848"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9417b55e45473a4708129e2aaf4f9d1066c9ffff97b46966c3c7f5b386b7f0a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b85f6eae8fcbfab510168181965b6edcf738efdddc7f8ab653d9681fb8eeda69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6c2394b915e306a35c98cacc75dbbb5b9d61aa8869488905eae350c74e6a8b7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b11621b6012fa2f3651e459b9662c8cdc69a33657987d061c0bd18ee3e291858"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "76373bb94d42269c1df5f8e9a3c2f70eed993231a7699969b3aa5563b58bd888"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4398f42be31868152f36e794e4e71601bd2b0f7a4b9a0d358cd4c41d6b65ab35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b8114619279a5fd3c3c6a0269a2ca6e4e2d49ac6023367aba31745a9add26d81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2237c70e3775a24cdfe94605ae939b193c7ab7b1dcaf1f90f4333ccbd6b2f57f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d340f3839ad230ce62aec254fb293eef12935ce0becfc2a7fede1ec6247648cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "ef231ad3009eaededdb597101a82ba085adb7f5563699b910866b670bc7939db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "fc850fc2f25df6d367e2b30af359f2801f541410550f87e72f3ee3fbdfc01b1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a206dad34a5a87975bf1ba725cb39b452306c58655ba96afa96fcd9ecaaa577c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2f4b88520914deb3d84f53f7406d1dbbcb915be4eed5a57909b341c219916f6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"1641a25deb0a4c0dd3b079a4db029b122d42a158284d7d344d6a677609e3524d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1955ec43c4160460bc251dd1b5716bfd4a1b4429e954e77e29a467704ce9d84f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "46c6ead91ba4ee7544b901edf80f5b359749988d77e34a3ad029fd0245eef328"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "99595cf95edebfff07fa204b03052229e2bf5a8df355c61ac6ff8a16e390b44b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "29c04dbca7f23c79190e86302b7f65318026c85c9cfa2811d9b2754d27c1b180"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "57708585ba641bb3064aeb5685cd04405db2594c0f421d70fc2e82566ece62ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "2b31b39913bbfcf7cca3935a79aa34a5692a7f35b2fd50ff8f52f8b07ef5e21d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "025f8ed0c686be03110aa7b725006379a11ff6ee88a81e98aa68ff461f244845"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "c0af1f147413f7fde74b7ed0fa6a499eca678eb531cc22b36137dfb8c9dad92f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "45eaa77275984e4422bdf96040a1d23f3a1392ec659795175fc75b3091972b83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "5914f4645d5029a1f5cd5d43b024a4bb19a2deb6ac1e674c36d8fb820e90814c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "5c9b5534278ee0d1924a640395c1ca1125e4a737a65229fab65ccfa106ff6d72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51f6ce6d9d19f534c9f50e861b60d1c4576d677f8c7a5792a694277a4edf315e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34e90f0039a257f9befdfbd4a3400dfdb7bd19e140028edb3f44474a93e42db3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9a196a35ff9de397dad1530143bcab8371ff1dbcdc26596e154ef618a2382c88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "07783c74e2e73166ec602be18a9ae930dc7838d1b86003a704aa9085715fb503"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "49eaecc0214f5bef9e28fed2bb81c321fc6c5ca46dcbbc6b24f903ef82242a00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "70436793c1a533109a4c86e1d6da186343a7743ace0801829e7183766edbc638"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7ae5aef8921696387c0cf187bf5e2570e1a0db69a581c0f4802ca9cc2e8c287c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bd291fc5a330d32cf0257cec97f7bd8df1385fc3bc8760e5384203990be08f91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2fdc498324fa372bee39e206377a64fdc1c09d23003e0a2f8277abf525e72153"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1177b2eb3331ccf7460c354d421065e9ad5599b665b2a0d4e1a49fc3ce794e11"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8fa5b6b34722dc73723979178e615f9be8012270dfcaaa9c31ed3c755d0c87e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2f29ef42db036da9a441421803466d73b3fa020aa5f6017faa8ebf6c1652c4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f16546495c774274a5a43752e7cc2f2df7446df8b16756ced502ad301e28732a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, 
"b04491061d7f6dab3132c1b8eb471bdc74fab7d44e32b3703bc18d5a63190004"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "af5abc35c9eba9b47ad2ebf173453a8a386076bcab8696c5d7abfe3c5374af19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "780b28ee83c86c7e2ce7d519e15ab4edfe2664f5226ed1d0597d2ab9dff8524e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0f353b6c4df9a6f9e9429cfb988ea960fd3266f50e61261206a4a231fe3baef2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b136dd05344f1cb9cba10c41a365c3a345e62260633f06c421390ef2d2ea7b2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "535394865b631d91ec07e0b6fb7a4eb979e57da06b6ebaf14ee5547af542bb7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "5af87e9d680c83933d2e160b686ee8c20e59a4d27d44724c87b878b56784d2e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c30b00fcfc19aec1550f9affdc60941501dc8183ba205d955088192110164f9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7595fc0f4f3cd5deaa4656ec42a8f40c6464806af690fe83a8c6fc397da4c341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b030f99f6b59fa582be35042845ef629e4ca53ef1d5a48bc6ed2f80fb2b0610"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "f8dfbd99684a41cd70f5b0e05d3219f0dcec9483a303185a0ab99e346164c2a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a4d8d76cf50feed08a1d46dbaef19bc07306cbdb265dd16732c52b615d39e656"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "486f3d363f3cd2f38a8b77b2beb48772a5b13a173703f5bedf025afce2d145ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b4d77f4b2680af7163ea1cb0c5a4794c790d1352013d7dbc6af1c7a739e1ec40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "47c8869d6822b13215294c3fac08a4a00b8ff6f71cef9d9f2ddb76a70abbe92a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b08443d9546b583553f1a4f1bbf949686b60925725e1c4a97d303bb74574a944"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9bc9f1eb8ce6b0dbda711ee592c4e89994384e7ebbef72a8cd7f50f1a1029133"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a1249e8cd856ad119abc5477c5b629681d180be71df394a1b121f6730cc479ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "e114d2b9c8865579afccd15e29211d7c4082a32710c0cab4007cab0d0cbe1497"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "6ebd3b9da98cdf4c8da5df05d9bb142e1aecd31e71a02bda048bd19a3297d4aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5af93f8b438346532127daf91e66025b1ef559d104e1ac5f9127b14c0ac26261"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2bf09d89d9aa77031b464f2a197202176823ffa5691683172c2c400996b7576c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a202d2fc4a8385960ef5c699c9684c11038831656f3420818ba4ae7227dc0ec7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9fbdd9abd5cf7a99dd835b47e2a50583b5126907837962e6497f7986a14de6d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "bf2fb480bb19c7f4652758e59aa3528237bec05c984737a4b2cd27bcbb5ca43a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, 
"eed4e81532b5f9bcd9d3477c26cc5d1768cc23caaf32a41b2995001d96035375"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7a5ba42336f009cd219a7ecdf5facb1e08470bb2c8f9cdfff8058cc99754b524"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cd9128007e7d556f0811968023a216843433f86d70883367361c739dd6e50ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fe7fca32abc642fd69f74ad50784cb45e30c64c644baf0db9af6ae917f701b19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "9da0a4904a71935d6b624b6e049b0f6ffaeb050aace692a80dd3e0656c793f8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3be7b25ef9179f28e225ddd4ea5a4282c9e7729dc795b2c577f7bc60c8853e89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "77f7591e5ac574a47f6a961c1b6303fa02a454f4957b066d4d140eab54fb5062"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fec3fc11428734bfc86f8704225b9bcae964fd374a65dec92331e81b6a4b1a2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8db1071d082829300ee8c4974d8bfe2d16bc0ccd0a40967445e0ed5371c11710"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6b6ca2eb265b1241eefbfb6ae3e9022606f99cd1fb18235a1ee4a6a32a57869b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7feb8df77a83c09ef2bb35026dc8dcd193a41d259f6f9132f552dd52aa3bddc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "995eba586268ea1c7d763d84593ad0c2a40bebf01926191e3493dc5142220ec4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "d3ce086729f126237ad5ca8c089ffe47c4ed54437925712c07d9530afed27daa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "44e718bbb180e55f5d6675236d0a7f9aa6198aaf82503302e769e3fb9058ef5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dbe93404cd44f27335a694a0fb735ebb58dbaab92e4891a081d0ec1c77164c7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "778e8f73f4662e7bfe6008d4a587dd909ea30cd3fe5fc58a06fc5f77efc1035f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "89eb09671d50267e8a3ac91e4a8ad0e26f8bd4901dd5a9f2679b51231ac11b45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c3640b6eebaf348a0a34efc84be4c6b0cd1168a865d33ca1de1587f3b480c512"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "30420e93729ffed8e08732d1db773839a5e1687f464346a21c88dfb6fac609d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, 
"c94399b535a69e2890e68c2e0905f1a339b04168ce751d776251154120c1cfde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "457bbe4919242232343f56a30afd83770c1e36cb83072e7cc7afef06509627c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f062157c37b0ddcefedd761e5a7b011546f359c5b8c5977216d1fc41c762dd1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "dd0261715f5093b1d17d1e2871b1e49c8c0503daacef91f5333abeb3feb584c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "96ef9f8d6cde14f3c72faca0ee768cbd926b427beb041bea3c32ddedc8e87500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "727bd57a04c5b6d6dff0fb5c5b4f4a8ce783404c1bd8517fa00e3204e2f97e56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "b735b150347aafff12b81f456bcd8e036e9a1c82c77dd7dfff4c9ff727e33cce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b7ae594d3ed3ec9ffc4546516b63ebca633c1093315f3936cd4159d264a8a846"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ad0d52c60921147cdf069d5249d59b60aa5ada9fb74eed5c0c48781b4a9cafcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "182db23b0fbe9ee542c49e56880e7e57d03c30b269e18c3d963730dd514e65a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "652cb71098e755b4607bd44a2447e948ffee37c5ffe6472ca5bb8cfa1fc18bd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6bef5089abee8668e7d310c9bdacbc578a45affad6bb69fb29d92f5bc99855c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "c8240c0ffd8b67eab5af0c1c93ecb48e2234d4feec274da75dd925134ba26c8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f60c9cbb880d7dbcfeb436b3485297408dae87a0a2d0a71dde3a063c9f72f395"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "04f8c740875b512537e7cb5266f58ba3f4472683cf35ec7547728001172f28ae"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9935e2c7a175c5acd427d69197296aca405d69caf9b706e2af049e2f799bbb99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5c4fca51ad4f3dda7e207e88e8949f8bb8132aedb853e35bbcb61a23b05c4876"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "eec10f431301ed6d68e517a4b7c8501613b6554f4ea6002d4bfd395d60cc9eb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "86ec86feec79290480ec5df4e180c1e6eb860edbe81d1ec0f94d0b540eca67bd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9ccf94747545de79b4257fd671e92c91475bd109df41913b1d3e7dfac079b886"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "182641eb20ac5c2e613786942d76428c95d113b342a27e6473c711cacbb06df9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "4f34c9518989e1d2385e14f53efcdcf7cf738e10c0b214ef08c8d475268916a8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "1fb715154df7a62a4320c9080df716de2174e8d3f92096f6b1aeb37f0eae1bda"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cb883d211fccbd30a724fec581f5332ee3d56443d8abdb0da9273b3b8c0fb53"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "9fb46fd2ce300717c258ec140278d25138ee5a6d998bcf0a18a2a76ec981a48f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"bcb4db62ab6719b8186e88b0abeec89a400cc5f2c6edc4366040abf72f113021"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "9c8dcd91cc8f5818b9e523e37d4b0ba9278883f72895d810003f0568307d0d0d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8c52805fd89aa6ea6da96fde7a360ae7dc3497ce2c767ced3829861b38dbcc20"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bdfeffa725459f11363966eb27d70069a8449e76bee11ab1f004349263b252dc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3b7e351e99b5cc2896efeaf7352bf011c44350ed3e6f4e7dba4aa1a84d6513bc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2169d53ab8b87d6e053637b9a766e13da6b4b14e1da7e7321a05bc8200d1403a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c7b8f8a8d89366aa9ba1f9751c997a57d524831f21ada80fd3e6ee12cba5e484"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4ebcf2d6b743508423a1c19c52ec502a11ceaf2b997483d4cb43f4dad81fdd7d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "29c144d99228fdbcadd86dfa0120b0bd1e0e1875a74b9c4a235771e323cc737c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "230c543154402823509c20cbfd456b1cecf05914923cbf4ad29e32dfdee7ccbd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "87fa015dcfa945308865c304dc1bbd537e69d68b929940b883981e1c35b8af0f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0774f283f9a89fa04df5b57e561460ab108963d82c3a3551cfd810d43b7b7892"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "4f55d799fdd362a87f26b2dd111d611980270ea4984956d45816cede5f43439f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "bd8268aae6a85789cab10d9fc4f2e0bc4bbcdee08e5cbfc188c3a0764127961c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a7f6b45cc48f4244fd67b28db27edfd4685ca912174f3677d0f3cef7e6a93b80"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7dc3ac9dae1e073709e299e88e47cafe9478ad618930785045fa0acf138681e6"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f38b469bfccfda22e8f335068ce9c7a7b55b8ee165492e82377589b74ed99bfb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "8ab1c905671bf3ab579e937ec58e6e7f2387eb649136ea8c6deccaa04fb00ad8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "b6638a471f09fb65c8f34bac7aa010f80c141e69d027767df809533c34c848f4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "316a46a33b0c9fbce73cf34e0c60bdb04784384f47d9d84262011ed2c25c277b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4944285f0a277fb90eaf1899e62dd145e6a89f95fb9059c4eadcc4c4b90296e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3288a4266f6961d2ef6308783a784436a3231cb4470318339b9c7fd097fecdb7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f41b6fa8ec1f56da8371bd92b957797e27892193d243217f4288e8292941ff5d"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1a9e34e44acae43d3b0494a98abfa6f257220d16c46f1002f5885f9c7fef6002"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "528847cb7ec54aa25fab979c8a3b872590eb1ccbe0ee039240d67c5311d8f4c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "0f39d9ebdafae344cde326cbef7c977a37c8a8cac0836673235ed6fc919cc284"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a53d1428ab650e4a4a0e2147c90093f2cd92f0ff79696ccd85d99c943ab0e68a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "55137cf5b1dc74f72714070b500a90d089ffc9c9758bc7bfe23e616447e25aa0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "0f7cf161327150d48bb1c336ee4be582eb361c990018723a4f628cc66b284ecc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"460fd548b29cb47b45a2c696ead62cc2b484fee6f8a0cc0d22e64dbf389dc74d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9f38e033d2ce8f76a1f7581150eaa91e13cbf4c37fb3dd20de0daf7f6ea9f118"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "96720e8665a8dbf64a2e3e069deb6d82c0bf40d247fe9256971678065cae27df"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ca5e80b5d5ab038a5b2ca69d96ff640dc413da14ed45f60d6445abef341d0d1d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "42acbbe845e33f78c1e40d920b97bfb48d448175e3d0840d7d094c9c61e39574"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "821ad1981854714ae754c8509ea71ed7b01515c61c0b31de7c1e19f4afd76598"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a6cf5e55448f57e63d2746546b72f8ef2c358c636e88d43856c948f2fd8e2d47"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "acb2e8ae4bcea0b04a384fb57a0cc73c2802af9c9ee20b7f3a0c545771d7d048"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "73b650a49732e42fb6d83526194bb16a65a0d2be6d5030575cfa2658dd3b3136"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "70f3f75cab63dd824ff6d6bf895457d1166d8f7511518def56f48a257157d5ab"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5d26a17bbdbca8a957dbcea7f9023207e1401a33191b9db2e57f29c67c7f61d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"be549af1f43cfb6b406a72992e9e5dbb60142bf0a55cda43d3b8adb62573a00f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2948cc24a4954678eece9c94a9083540beaab586568bdbe8e10c48b06dbbc8f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ead78ceeec575730613a1f4ed56c40c8562a73e4cf5daff2e92badaeeb3ae6f9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "4c0a4a418485d33989f1b6ec5cf6d76b70326ca09509a5f7ee89248d45f6d5f1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "801d91bac1cf7e0ef3f1cf725eb714f2f2a5ed473aa30b987cd315d7251bf096"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6f115926c2e4dfbaec8ee88816a9bcec7b45a3d5685cd12e4cb2a39ccfb0a98f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d0f9601918d7b962d46e83b295596582d9bd33cf0d1f9bb7ae6ea34a7c6fada9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6f01c731abd80ba83f139b8e27b798fd9f2805ad3572369585d9937dd046d7b1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "32e0e4d173f43bb1cfa0073f59ab35c6b606e1c419bed50ed3372111451ad37a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "b36225dda9fe5c4cbf30a335674cc71fbee86baa3f5012e33ba760db9a3b899b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "bb416dca27d7e494114971e48c3ae4282f0043ae526e22f28a316807ced8579c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "6df75b1cb2dab6f0113ef8e6eda7369ccd139fab8f109036373aeeddab8b1ff3"}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "50909b1d8e5646ebd23ae7af794c4d44fc4f22973f36ec0c7d61869231adcb98"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3fbbe94abf9b7b7f63c26886408595863da12330774167e1b55faa08d7f9cc7e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4a03907230f5fa458439e610e5be58ee92ff10fe87b40d1230c18d8c9e22a05d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e615c26f7aaeb165b8e8c648fdc4314a8a8e5d759185aad0fa31956f967467e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d29c4c4182aaf7a290c7f7d0c8ac970a067d81a3695ca7a85cf9a72e566b974e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eb5213d2e580a3119177a4e053ce5784179e7bcfbc6b33d203a8a888533964ef"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, 
"677f4c3c1437b284eb34b9845d588a3c11a8b83eafedb9338a4db99b2c503739"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2aa945056fec691b9b5903f22213ca344ddc4ef8b9c2bc248da34f64cd1abc6b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "75723738dd885b2fb9977c3facc00bbf62d43aa0d639751d61e032cbd894f147"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "257d22762003dd5123c301dd0cc81335739c13d43f121d720a3179dcd7850ed5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1363b3a9aec1bf998adbeba25d199094fd72df0ca373012aeaaa11911ee11272"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fe4edfa081b5f3a3923bdc6bbeee1fb6f84abb88a76b83cda5cb4eade4fb16c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "61b533b424fc566cc6217ea277af878866151369503ef3278d9fb31c2d97e53d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 
64, 2, 2, 8, 0, 0, true, false, false, false, false, "b01490134b8237b2527b67eb0675e1cb3449b7a2b9bcea9e3b8368eda28b11fa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "b27741daf4b3dc2c518cd321b924f2c79ccaf4a0089ea7236a6ab789dc2d7f4b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "69ac21eaa149195769d6a6cf3540759209cd24fd9e29a0cad85d03ea90d3f249"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "6e890bc105cbf707132d3231951f226bd9d31526af39ec884e310e0215dfd5a4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c8c16dd271d5934add27639892307646bb54e3ee9caac0b9b01b8edf84b7f80d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "0717e1a3869d1df32505f9aa54dc03b2e69aa283a1fbc80aa1f3de29ba05aa7c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "72db408a6840bf99f11594e8d0f8e69409a4f156dfc61bb993c8dcc550048e41"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "455dd22753f9e41802f827a1088d84e0ae9f950ede3ac72cf8fb7321e025ba21"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "dc682a60e0aa9c57b348e60238acf0639f718c73fcf46a43e4831d8f022a78cb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c5ae1a1be4491c83605392c0bede321955de62e02c4b2a9a96d59601bc3ce23d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "37da47fe73dc7fb45b4bfb41e5272bbc84331404f29427fa151c12d6753f1748"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"e2423ff169f3bd1d7fa142347371c269e837ef7c23e3b17ab959e5e86cb83539"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0bfb53cd894112eafc6be47dd328ac9da15b177a9f69b81229c575800f965047"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9162be9d1c69258a04b68084bfac6440c945f5b155fdfba25dee6da9e410e8ec"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "fa537fec3f6eccb3340802e360707cf80cee0e75aa9b0ad9b30ca317601279b1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, 
false, false, false, "b8d3a946aa2da9482cb69daf02614db34071f8091c5b6888cb9edc414ba971e6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "ddf823782723bdb9a633b18f0d9c0f7053e3122924230c0194240494b80a3b84"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3466310d0a52f42e39136455561f0b80ece163f8aa473b053982e2ef5889b29d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1375010cdbff780835889587802d34c77f96a3e766d7a75754212335ce5c22be"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e7045b374d11d7a61bd34bf3e59bc5d4658bb955748ad59a50bdb0ebb76c628b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4a8af28a5af81b4d89de7e39f5402f7499382ad8e9d8eb11512b00bc174d9684"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4008406992419e8da952d25712da67eb5ac7d337edf7fa333e1a85509cf3946d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e1db2ee1ea569c82ba5bc6074c7ed16740365a8c5f143b45b1fcf56d95cea6d9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c4905ed7587be0042a35ab39c9bafa1e491d19130be168dc56f66c3d3e9ca7b3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "9e811866ea52b3b85faff0625849b153f94996c248e334e8e7fc3695104e1466"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "5f5f481e2f2fbddb40dee009f4fb452d9d52c9789c4a7a68bc6e6c680e7ff118"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d63dd3790eb52a0a7d66f10490ffeac8e334802128e1454cb052ffa22f4c5a56"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c6dd40a560308b28c5c309c448f5e5ae3c36912d68e2277475e884a3560d1493"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2e968ef8ca052ca8927c48cf06621f07f48bebada7df450b71451635de89584f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "374c8eacc50c427f4c49869b57e1a3127c10545d5f03b2a05222d257f4563124"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6c55771e992dd9a2e8118aacb148d0c4b2a23f1f5b590084865263fb9095a77"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d418c0d6c339aa2cd900aa4f137ada5e4398ea23ffa40fb872c08b513a9a84b7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "83440b26591540db553ee8239c41de66c6cc316b18cbdc2be83e9e1a124dfc13"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "cd04010643d137757e0b3e3d5539d8bc7a8177e8635e559cc048184766931533"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "4e044c7f75e710c8789758e4ec5af1f6e4e5de5f10dbf47da890615f7f22b121"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1e4ff83a06eee63027374707cf29d6029a38951ceb9e99ce119b2e97ab0e0d0e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "82b2dedc92a260b7138e37d97a070efb94da1d2719b8feebf958acd84c8e3b8e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "41eb1f61966e68858f7f810502d42c30d6cf2fb95f29f271821d250f55bae374"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "3ec454cc65391f255e6dce530fbffee768e753e865b073728235edf63210a3a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1f3bf63cc656975642c2c208feceafe1670107da6c64f584dd3b96d0a1d9c9b8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "285002b3ad35ee6e9ce0e93ddff1345ac30d2087a15eb64aaa518b5770249847"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b4ce37da664e1c8d09ccd8b41d65e9380392e9f0368757c492759792c4313fee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dae405ecfde1a7086ce0bb483a1ad55520cd4bc3b3e2e39e9d5998e24a16b99c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4a0a046ee21a50f35ceefa7ae2e6b4998ad6903fe80f51e4f5b4db085589ad58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"361c8062b46b59fa022bccf62db501b5c2329bd22fed4054de1914965390c137"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "de476a4e7ec25ac4c97c215c618e086bf2fb6066725cff4a065df4207b7dc649"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c33c389e8dfa4426547850021957807d567b54d373961dbcdad577d9e55bdfb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7dab3be35d8e74d50f5cc767b6c9ed824dff3d4fd9562356e974fb6f97756f0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e6f976c884ed4b763cd8dcb00354da177d2c18be7600e1581ef9b108e26d29e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "413c12f1020589ec3421625a92ef7f48e42f01792bbd05669ab114f0e4409c47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b90a2492416e941da09f190e2ee48f6a4cb33371b4e2538dba6ebff48d614c64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "5245a3bf26fd1515f05ce53e0497a082bb6f3ead23141d215a50d68adf39a12a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef95050542686523e0da27d52c19d45b81be68137b91f06701e632f272e65712"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "31c8832d1023f66aadad802a5372adfd4538ca26aed384d373abcb8a8a8393f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1c6742d8ccce5ef2d81eabbeb8606952512b7dffd8c9a045157de032f798c25f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d581c5223529ec7f4c110eca9da3cc749e038a33ac5b6baaa1ade60aa1dc504b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "411b530906f9649de9183cbcf91de6d4bc91b19949fed85e3c78013dbea28e7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "68477883f806efbe60c1ef0484281d3c3d757d50ddea48be6dea8771e6cedb5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "d96ebdbb6ce00423cf6ce0cfa5d88f3cb8b136337fc4f6a373697d9880de54a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08fc1866315e1c6e2079fdd338254a20ba33aab55a42ca4b9829a6913fcc75af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cae15477dd94082cd6ee270ec5e2ec5f4f82c770337ab2b50f1299468901473c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "650ca39f07b7191377236772177c6dfbd44d958c3b9a17f26181217a9543121b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7fe3aebfc7b8281ddbce97a73ce68501815772d1c1e283d712d88a49133cfffb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b2b0aef79fb33d27af6fca313f2d6c91f38c378256d02d2bcbd30f670173b341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3397cc45eceb71d154e18a61c877416516568476cff80257b629f119d01e6226"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "9139bf19238378c04d5d0e0230da1dacafc9109625e1de124061ca3aa713e82b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "2b2d903b3fd51f2e371d7aed07485ad703ac97714d3cff205c69d2b74c4c921d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "07474ce73181c7744875877f7d02511922da050a6310eb8cd645eec7a2788da4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7f1dc0923c1b3396d15825fa57bcfc59ec9a29f6e80451e5759cf9c38da985ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d969b25dc758c6fbafc2c6565770b339a215c27bc529243cc61ab9358682f380"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e292cab669ea2df096072a16868b20685c6f32ec5b292964015655c036b288a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b23f148b10dab1c8201f33d1566badb8e390cfad0f4d5a1076e211cda02ea26a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "21730738f73c5c5b148b6ca80c5b44e6811762dc97e54e4b021b7bf191ac4c41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "334a8662d98ec7941489fded603a2bc81efe7fe37e4f90f6ab0955fd585ea867"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "410fa3dc445207cc7fc9e7043fa63d57707468197952a660e14dbc99ac79fc6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "4922aa042d713bcff368dc73c7b85225dc3c70b32dba07fddb07a9e3b6c7bcb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8418229a5b6037678bdee35768d57f311e2ed224c02eecb7677dafc0cbd63f58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3ce76f23d4be064c022433a1d865f17fdc27658adb460b5ad8d3a446334a37e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cb0218fe8e26b85a022ac2025760a589341ad3685c82f36941d1f852ccdba6b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "91e889adbe86e5e0f2c1fe586e50a90d7cf115291933f67ec32dca49f584c77f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b3dac11c4966a19cc994cae3dd6f6885aae98b0fa3a6bf98e7259d870a35e2e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"7ae0bdba58ccd95b96642fdbcc08c5b0b7fb493f852aaeb781154eee12042397"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "554545443eb760d57ccdb2a756a9dc20a268206fda77f7fe275489da55045346"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7462782646732bfaa443db7c5bbfc19e4f475f2e96794527df5d4052ec07539b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "57ddaba23e9cfde13c22148b3e61ef40492ac017906faec86081b56ad2a5eced"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "589270169dad7709a57611fb9d15ab20e835b5a5ae9e2a87da76fd183c722693"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bd81142e5170b90a199fb33df43c6b60be78fa61c3c5e03a0b98354d6471c5ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "181b9627019eb7bde84fcbfe0de6b5155bf2a88b591cb0e9c13b5e11bd31a5af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"070ea6ad709119cf7d31423d509e25e0e7cff6d5e652c01f18b1dec9831cc436"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7eee09c5d52e9a90d292d4f330f84c30a58b3924fe5e232d9ecde7873e94b7b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1659c39a86c2a2bb508538c641e0031538c3055471e89d893d343d0ccded2c7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "0d9d6498acaee35e72d070c59eae4cc999b8fba4518eb48b9186cb109ab5b02c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "36c964ed340fc22a85ec0b5d16105e5c9a371f30a9ccf3e0516e1dead7d6c52a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e19faf863694e0183bedadfa89be862e7a8538adbb73af3587363145f2b67022"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "436fe7c7c30be1893e80762c017ec5456ad44dbbd282596592fd277f610be734"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "e8e53157e8d2402165a579c860c43c5a61b25b9b5c3e4e5dd2d808e4d27722ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7e08fb0d278215481f19e1a7497f7d16b7b026638938dcf5d1ebe91db503417e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f433bb58cfbcb9964ac30bcec20fa61728c3feea648d207658ad123d3313b351"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6de13c035c2d3bc349ef7d2644fdb4663d761f1c9bc24879b0aa5a087172916e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6696e084829754c2fee3e85a600e5f94c0e86ee94ab317b4dad626712537e0d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e7c75814cbdca50752ea4ce72ff74a8a14f9c039d67164ccbc22678c8e2894d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8d5f4baefcfc1b90d5da3e3671fb25f1de46e66d7e13b9a55130e1381a56e008"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "66708df91ef52833448d1f946bfbeb3d03a74858b19a6be855bee67d2df60d40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f412ac17cff96421a24d3d5f518ed8b0fd3a697f0c502c3fa8546f9f7ca2ac01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "63015997e0478d771f5b52fb293aa0d06d1e2441efacab666468750f867b95ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8faa0bd0a63abf9e4a15975e50552bf70382054f7a88bf60c4bf662b2c9084b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ac9ada1965121f33ff25a2e193c9140ed88d4f050b7f951ba682d3c16aa22ca2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b05d8abbae51e9ae009b84eb8ce66bd2b4e69334d82253ff9c0755bbf9ef15a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e4f84554733422dc595c4dfab3c880bf4acb0ccf732a521fe98eab87bd54a06f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "81268a7fe864be3f4b982ec4a796b66cea3556f975e228cbeb3871030d37af96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1bf4285bca91c05e13f868d00915ca0d0df6912664f43c1bcf56cc2bd0a4313b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6f74fc5cbbc5b1d67c4c7f030657abbf8511d54292cd403f823e4bd9574d04dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6826ea261131f29b0e7459d5ca44f40504e0ffd84384436b5ccc959d0ece4b84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a5f18910667891c8d2f645cedde4ecf0ebba40f1ee119c6bceedd599ca784648"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4b4c4e27d55f7cdb471e603f8433f78235fb6a86810c7cb1fec198350c378e24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f2ae94f2395540bd8872f8011757afdd16b08e6cf9130d319798c1e239519fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cf66990c6c95224782fff685f5998eb411796a7d49e28adc7baa79126acc701c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "fb3b06313bc5e2151beeb9e4806665c0cdb713cc0d8775f8595f223186049cea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "05e63dd8691e841e674ace39a3a35e1b7ee763da37b9890b8a780245e8010fcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c889befd61d27efda91f8cacedb055e0b41c45fcabb88d6e5b16675fdd49ead"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6d03b6d157ca92d3782e79c8858b8f7c28d6d3e41c58b5b9f8c6c6228a35a4fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "68489ca0d4571f736a5370a52e51da915c60e209db2e71174de74b6410bdfcce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "350058c3fe7ed093e0049e2597e9125dd8f6b420621c8f4be4b22d19ed361e94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4a2700b7152ad4fb89b7b91d0c715a6fa58c801456ecc9b05e7f10c73f9731f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ed6b20860dcba20fc2c727368d488d2c85e74969808ac9368cd25547d640d147"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"2158cf81ae06df1f7a7a89b283312e978bbf88435acb4443335213229cff244e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "87912b2bdf6339f6179e34b1e9c0470d204f2faea71bdf1abf8df7c9e99da822"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4d11b716399c7647827f1fd6501e0f72846496e432530dbe217ceb3fe3ef25a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "3895674d8083516bfe06ba2ac1fdca391c66043e52382c5fad921ac97baf0a4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8c9584c109a73dd9321514e5afa07e78e6eb264e899b644df374ca24955ed4a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1df866c843a08ad3f7bdb1e0c9c5455b8668455aed3c026cb71d5c871a957faa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "bf26bffe4caff1f85a5eb3cb7eb2a579db727a9cdb965dd93a3fbbb1310a2d84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ed18705e7c62b6a9aaea73ecc488f61cc54ef9714635632b13b8e61e29256539"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "50af3b58acdc7438ef0e362bfd3b55505a867d036721b192173ef7f4ec424f34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4a9185c5ac9107d15e8ecaf2ac56fe51218be6f426812916382633f36662558d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e2c79afc3c30ce50889e278704c2d24686db98b3f8bee6bcb2121e8881028077"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "16d3b525a6015c0a6cc97fbf692be2147efc35ff7b3d08987138c6564f1e8d9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0184ea32e6f67491c2a7db0b389fd1fa395fcffbe332da88be453bd2a54916a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fdc64ee864c03b283c18db04fb35c93a11ad36b2463202673e1ccc97b04f0062"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "23b1a12781197d5d8ccc7991aaded8571c5ba4049a8d9763275b6f43455ff775"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef47280552f602e9baf279181e844471a9e33fd803dc655487133a2dd0a8dc0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bdc3e33086351daa36faf88384f8ad1d2be247271f25bda681b5fde76e36fa6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d84d1ef9971b121b0a4a80741962e3b035f2d1b2d5542cdd9cd8f8c3caa290bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5c17b060041843c194c41bb6cb79ec96aa407f29e4988d8676b0ed0f3def0ac1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "002cb8491d4a52d5c8fd28cdf375d0199ef4fa9de5d014d751b6d97c2e357b3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "685125d950358ad9db696c5f93a738769c8be7123bc3e9c396737df9e9a0fd83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a8940b3f18bafe4b52ccb1e012572e83ab59dc7cfcf15579a8398ba73d6095b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d9e9006de4dfd5f0def250959a99e4144654cfca519df12a7ed0e5eeea1aa08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d6d5afead98163a4d8162005fc81c74f20a1a16a66ee3429bf33022c42930967"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0504bc2c68b7b1674ee2f4f9715f63bb7f7f05ffafb374e1d9ffee2311ab4245"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, 
true, false, false, false, false, "bbca61241a7d5470c977c6a979f37ffabbababfbcce6b04d4bbbaec8fea886f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a3674a44e52dd7bf455d9029d4b41af5621dca2256abebc0acbe02984226a307"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "196678b86c35615325c9296b2cc6e724697b593301b9f8d97171dd7fb062ea92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "2580fbc5896dedf76e8ae1af6d33cc44480dd337ac8312b0e112f689a9596e1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "07d0a698e20461856728bbb98f20767804be64bd7adb4c141695f56887914ae5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "127877f43df2b516483c5e84fad79c4b0f9f1300c1fd65ac30ebc1de10f68572"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "9a86ecd5d29d82f925ba6386417023f109e615030ec1179e2ab2a979dffac5ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1a57555afe868396a607b68563eebd3f4841afc9d42e19e058b66ed69f17b6eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "da584d56225e6647e0848f48f76a43f93365b361b31e3362e2f0b1974f4d04a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "c81942c37d5ba125201e77a281798ac6be2b3d1e0cfee13ea5cd7a83ed422539"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e2971443adbacc24f246506effec0b0478fcf2aadadfea56fdb945b385ba5f1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "89674128b8fc2760b038446a97b3c5cb39db716da35a76848f2551953b63b688"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "2bcd93fecf10f93a65fa8ebfe9e1c14fead4b5aa60a5dd36760dd231eab1cf49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "efce311aaeacfd31779d869549027c811c7ce59a2082c949a15c21cc638b933b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "16c7bb25f8e8369a41ebf46902312bbdd6479bebe981a82803788354ede97068"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b393eb438affa7a846d2262e2855e78a2e1da71e3b88bed5a445bc85406b8f3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6ebe217e90a0dbfa14c3932b76d4609cf06175e8f8207623942abb7ceb2471a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1d0d7f3bc80f385207f6585ea5054f8096a895789fc0871121b27467e460a6e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d799be43b1a27aa75d3abf43f5fcd0960ff28324b9980c99a9002e97529b230b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "17b4a3bc8ace51f359621f16b7749ded041fffbc2c24a8c208160628de77d0b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f55c3c102d52e7e0945532254c693927e1144d046822f1fd22688df35176d6ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a2481137a29866f336db568a3e39d7e1101ce0fd2a6de4419b3a5c3aee7b294d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "067a651237d2357ab09b9c0b3ad5632c614ae6d60acbd68ebe52b81b675ff01e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d3ef53cf3152303eeef22139d1066c6122764653d8d0d23c5ec6e30c89aead2b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e49f73056ba6879dcd684ed211d684745c211fc5dd2562969cc5ac51cd5a73a0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "03da687e50bd1d8f40307e533af5481d54be21b410bb8fe4fde5907443eab7dd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "25ca5d20d3d6b4d1e8356475c4d92d7c0e40c210bfaebd3b0df58aa968cded68"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f70f9c2fd3a7a21b3dc2a953fc9c00e4eaefd297a5a7289b2c88e47ac0ff44fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6fc9e7c83cc544ba18f1dfe0832981c93e5133709197a1c7bec241874122caa1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "222238a9dd8dec2b913b26c5c55acac8cc44c65651808a1bd57839d5205a2a44"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b3c19f490074286776c65f7f2504c9ee745f586d323e236dd3fb3f287eef78c0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7a0e360ac921d5d6540533e95571230731f1ffc20a2a2aa09f11de156d53eab4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "686092b177ad49080adf04dc6b165da89e44fe9490137b25745ad194177bc368"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "79c395ef232a280234e7ae0212c373c2238dd9fd4bebfe1ae5143d65abb1abdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c3730cac4f311f6b1d8d8c5b1c026ab3450cd5d10e50a707e068355fc86ed00a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1b9be99d95dc504154ac80d10af9d5fc25ec92c4dc9dfbe259cce853f82ade97"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a11fdb48f481e747fd4a206ff5ce915c1a93f97925ee16607b0cb2972d506994"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3348d66a5b441fbca450a6c2fa870c7fca8247ff3329dfc9a5145f4f179dc65c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "322caf1ad82758bddfb734fca4056ce258580f1b7724357b143cb3751801f42e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "bf60a478fba8cbf16fa8b2beb1f6634a799db0d4c2e5bad2dfb1c2f2040392ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "03449ccd017926be8ac6e88a98cbf81fecac6e1d84bf51a80126dfb24f8f6768"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b0f857ec836447de63562c192ba78b7e6fce1214c80b8bfc3bc28306f73e7b7f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ec503ab4914b8e5e91464cd95c50f6a520464f49b6df4c919fa76b0a2276d49d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f58af9586243cf2fe94bcd0a99ea557cef829ce3a66c976f2b647e4d956fab47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f31ff02a352aeb93df080b77154f209da7775b484c810147fed08f2dc5a80f05"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bc4243ef05beb39c9a82b13f7bce858aa9ab70ad52fc83f0616f313eec7dba0c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "bdc77dd5b8f399dca461a1a2d03ec2e283ca834ef75770a9e73ebe04355451e9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fd877921c396bea0bf993d767fe9440958628eb30ab4fdc050129bff6f9a9834"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1aad80c4b45c9fd2204116991781f988cece34f0c43132e9d311b62dc2899b82"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ab312ef57e71e20e45c67b072d61d323c40cafeb3ecc6a231880584b40c92d28"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6935347240e1fed20b7f3efee678c5bb4bb1512e2c341590e28c91b446818f6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"4ec08a4a82b0b29263b5b073f40c492cd1c238325e4a47de04dc11f621d3e80e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1632ed7994e2c4b0edec50731c97af800fe442185c4e6fc7f1b1ecb4aa0fcb2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "da4000aac82326e4f43ed74e214fc1c7c220bcb70a734ef972fb27013fcc6e7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c4ce90d8165312712be77cbefacd363c7d0596d83252b15475b3d8df4c4148cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f7a19561eb5de400d392bcaa272b9ca914c522a472e8391c2982d9561573b963"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "120f9b7054c042ac44201bf00954dbe4d55a404b95b4cf4647ca8aca2357c143"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d84389965bcf53cefbaa97b828b98ab58cebdb4d18109fe5cfbe71ad3f30bfcd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "98a73fd759b07673797a883c026050f9ac4fb2e1c8bbe51113667f9d25bccad8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ca62d2f6377a0bd13080a19c5b029f6e024514a7f7cbaee7d98dff5afab8e851"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b9916ed4097e6ec29d93ba1048ee5b4c0bedb9dbe5248d3e4359474894b3cb20"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b48500c7f8e1eb09ef85aece90bab46d0981cc4cbd6bbef5e6e057dd910e2d15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5baf200a5a104b73c4d511d31b05bac0d637f9c703fa1516859cd8df879cfe1c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f517ec87f78d21dbd197573592aa8d57349a317f1c8915bc8b65be7fa96037a0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6658daa15285fbc165fab4ff3bda0f66aab6f7845021401305afb15f4353aeef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5b3728bab415731a8a37e804f89f2076529198049107b3822faf24a21d51ef41"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a017ce9aa1ecbba833d48a6e0b40113d865002e8d61cd316c1724fbc0d03c9f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9125866a7abd25751ea0ea21c1ff429180495b55b98925e1230198ba445d9f16"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "25e66da4037f63713773d176ace63c21051c082ef4ed471ac34ed95109c79e50"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2cb6f3750e4c30633c7343f83448eb2065d084de5636af9a39fd1fde4bb24f83"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "3369f08d22c4a4016c937cfe45c7146fa62566d858f58b02365b6201b2803a59"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ff0adbcb6aee973b8270d156940ce34360949b821bf68336b9676ac26224b9c4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"661886b7ca4f9f5481df511af4100783a2552d9d9de3b93d420a4d4c5b1c9929"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d5e4c791a7f4d7071d1424fce55f8f80d65996082226ec7a46080351aa78bb5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "037404cf3f2dc03ca7a65a0c4121ad1e69ac8bdbf96fdf48376cfac923835b0a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "546f29535ef7135498d4b0fa1b1b09f0496d4a0f392c32ba121ffd8fdcc3a06c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "9c28c1d10633bd3687c120fa07d8241f76f10921b2b72a9fb23f1d14f7e86074"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "33983e35ec695ed6bdf4108612e1aed6a53e0487651285d76dbc115324c5cfc2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "80271d667ad4caf46c6e44b68cc9cee79e02c4f5ea0435ddaa96422a32dc80aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bbe90255637d8db42b5b5832f7d90f7b09cfa94808abff928d0906c1d2ed9398"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6a19d2d2dd3ef3e3ef8ef00a9c59069d8e1d1aa4b63bac82f25117b4462d0e86"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "13927d266bcd29c5c0a8af8cfe46d6a2667a990d6ba3dce1c64f4e1c69d2e4cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "36e981355f6b1389cf2e61ec2f9c7ea053760dd95d480da13133ae3b678c261e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "70464c9405651168e735300c90f9a85ed7b96d5d51b1fcc926879474b6f28076"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "97bcf8b9ec0bd409da7e93af6519f19e37b9f9dd99376dcd26d7283a35d88b60"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "dd0fad1a5922c0d8e494e3c2ae3be6827c900f7e97848652c3650b00221d830b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "601c75671c2e8200b30678d647aae9038e16d0d9c4ad853658ef358332d26d65"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f49d73b6c96067b4186c841f7b5a4cbc84c65db3ad00625e28aeaf4c12b879f4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a018ea409ee1143ff4a676ee72d468fadfe5c2e3acdd93879b7912e20dcf9f38"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, 
false, false, "943af2b3366fb413b08413127f89bb1763c9612c57ac4bc59d0b8a63a9eb90bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4bb6fa7fa231928b0c65629e3c5b05c2d7c5b1ece3ec2ab413d3435a702b2766"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7d638e908db0529dbc6c6c86a6c33deb3a1f217ab351b2fb3f6343093359636d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "fa4b814ab09c41ee4de447afd7a90c3b5675e98ebae5c8e77c9b406776f12e1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "35cd9b42ef3498996a9cb380a6d1d814ce0da1947dd5c58e0bc4d059b623e041"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5126e4f56a25290c2b8d1f330843d6c8591231ae413040062dd8b438bf005011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "dd046dc7b1845540c18b89c34c41ea18dc1e1a29085602978a09c9a3175212cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "4ad605c2d8ffadbca3d29353705e8c371915a1b0d94b58d670f2306d10bf7b71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e0b875c5a0ae8e7c2945b48e749b375012f5d22ddb49a7c3e62a059165195f96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "02200c05bbec8791832887523523366175d5784dfbf671128a66a6913370f81d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f728b32ca36a0e6020da9ab1879387d0dfcb6c6e663d0c59d499a952986709b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ad33ab04276b5fcf2ab2f6cefd51335c8e8bcb17f0045b0ba9f06f26d49b74f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e05af8ec5f29ec8d104c977eef753a8b6b93715e97d78fe06faf84f68fbece14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8382b55bdc6f5b96cecc842ae32f63cd7234847e3efbf0b1eab1141422d0e987"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d3811617e9fe603097324687c57c4eaf8c4dc4abadeb18bd2b377230be72e6c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ba2005237f76f5869b0a485bc6626ebbdd8a6ec9bb7744fb4565b3a098244044"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f50f828ceea79bc786c199f580b5a7ec4803e29061d025a5cdcbd614c319c3c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2306bf599265c6e517a243d3b74d5d6f337305f7e8b79641ead03fdf44b0a547"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, 
false, false, false, false, false, "c19f80eb0dcef908b5b8d1c0ceead9933b5a513485c13a6d4a5c2e14a6fd964f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "887ccfd85a30616c0c58ad4e8eea1476a790a3da4902d7e18cc83c153d234555"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d8589f2c0222821ea9e51bbb819408ad278c4ec9552fdb5e36144bec5a5d24fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cca3616a20e1b54effad66debe673dfe2141036668a22449930ff6b71ff7354b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "93408263c682a7d80429bb8a1289f16a598bae3284c6fc86a60d22b4912b2d10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b1d6a9cc6b275ae13f705df371e3c449a07b5194583764d431e016f84060dd35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ccf9b978862412809723d35b26edd4646ab5fc7249e19733efa85543c64c9754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"05bc323d443d200e691312ccdd0d0bf3579e5f9626e218512600d213286c7c37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0817f661035cd9eb3f109bee3aa7a84a12b2c1fd02633318d85fe912bb95f2bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0b578ed8b90603f4d220b70c0dad81ac23800c342f22b2673f812dd10380c4d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "72ad3c69f07d577491dafb9327197b6288c43769959ab05092feda3dc2d9e847"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7cac3b205e7ebf2ce1c507266e822e5d3a2fef248f0089804d400510fb345427"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b4cb07bc41adc8d4b5998321531be4ae3d153104c216c34f243e579b83e56538"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "64dfcc09d1a5ce5859b8e2cde6ba7460bbfac495944e63ae388936c83ecc8c38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "01cb5a933c043b54d53fc63b8e29eac9c16e6d03e0a002909b5ceb112a3e7dc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15bd9807f0d9176127fd28fd396e070e2631caf3de50d06fee6faf1a294c443d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "599dedc127cd124ea5507968205e5dd8db7fe8c0bbb31a492e669083db3e4da5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "aaccfcdc786772f2b4d2aa33267ceebbf963994c86a236dc6282e6f7838a2d1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e8f010acd8914f1df8ebce4b064b06402c3fa87884b19c3e18cfe5833d2625df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6db593b663e49fd7e95bb998cf04f65f9c0902bedc188c37aa489bf902ffa137"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5b568f1e38749f0ed5411373595e07a0693fa0b2f969ac1d16c69ac7aa29a0ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"bfb0403a6bab602491597d0cc806d72d85f9888a3ff0e83530b7ccd85e091bfb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b927ab1aa4e7983385acc77ea780fba0d572f63eefc6390979d91cbad59c9762"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4d31561de088e89031e458eb788e2a8b6651aaa7f385167155e5abca5619a501"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "572245ea36afb7ec7caf475b6295856159fab32d3e94f82b5369056ee51cd49f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a32d309b5894248e4d80b75a586701b27d54042a070a9ffd7864f1b271f794bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a6d1f1a2207cfd2fad701a90719ee5524033cafbdb322667eba4b62d1d8bb060"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3bc1cef645542b0ae6081cf462bdfc25d50cce04d5a7bee4ff121662ad4f1d58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b6d3766ba4b37945c5275821118b92d09ec4b01374a22ced66a307c5708cd07f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a211df68939460edf28780d98bd46c8cc286f68529913b2cd19e94781355d03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b879b38edf501952f15b9efde6a19d56a62a47ad2a23b1dc20754f72b33dbbf8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fb629ba8ba51a628be9565c72bdfe3b240d645702a4b48ab2ce674e6516236bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"f27c3d176194e9691be59e6d30c8467787da0474a01032d6f318438f7c051fcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bd029c0f24d0e40196794bfb3d44b13a34924126946deff31d7158de4ff96760"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "80736a8cd4ed89b599c0c2c3fc819403ce49518f93bf0ab1e61b3e3a3747e043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3161c4126c5a24f2d63fb399d3abaf150492ee87f0b91f55c723fb4433ac1b68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1f44e818e546f638a3cc25ae2a674785fd8c255ce1c7dfdbaca673019a877748"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e9617006ac8da9b95a8c7536396a71ccaa5c8c0ee9866e1a23b68e3ff81840e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "90713b6980f809036f9bffc19a765e7bccc0a06ef1c4e020d9aa1790900b944d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "972852ea83791ac60941df16a624c05bbba193ec9b9558f03eacedaad6e27288"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "170081a70beb293b27ad2024e890e5fb342d2585cb8dea99d979a6e952a14431"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ffd149c2802544665f9a83092d6c37cc52083f9e33ddc80b2d2c8ab75a0c457d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0f14f17e08d3bfa3a8e8bc440ec9543573f23ba9b170a609d468fdc2c0918d7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "18609288798de4ad1e664829ecc51fa4d627aac44b3df1a5552c2cb8837e50bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9ced1100fa545cb1fc2d759d845a14f6cb44a44baa82ff8843475ce88742981f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "56212a95dfa3cbcbec9bc040f829f7f9193f441959c84e87751930be2ef01b23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4ebbfe62f0c2986b5af1317ae1ca3c69c4dc9fa5e71830f81eb1f83cbbd2cc5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c8e7802d5bd5709e5997d3d8402d664062d0bf4acecdf1c64d8ea6ae87173fdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f07490d6b0e5b4e6d0d65b9679057f3c78ccc2f6d6cea3902b36a47c1ad69c68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "0b83ac5a5c26ac4e819c7dd66bfbb644c380cafabc9e4966c0441a4b8db71b79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1a6a520543fc30fab1e9a827e44defa4a4e8b9c2b5594ca74db49c585bb514eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4eabd0507943f2c76867f0670ae0a11a4b361cb2d56d88d2f23d83e93d47e1d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f2e42d947bba45db9ca364950e3a005d06a5568643da130763f4d4626713a63d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7e7b5b1d8d9e1ef6e73d44fbbebeb123d25221c108c86ca0c97636de830f1b85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6c8e1561c777c9b18af2e5f499b875804518cdc6e0ced744f0dd8977b275dc46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305867fedfbc1f2aad8abe488ddf3e2f8d691de02f6b25803b84435a903f9b58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d8ec07ab25fea1a1b5ef5dbb6a27a1ccb5699652176d51b5609c321767f7525b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, 
false, false, false, "b3fc1ff1c6ae37fb1e2f2ec64667d171e481e0189a2c5a8547f26fac34cbbf16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "617c150a57938fb315966da841282a655ce7fbcd0f8b2c41fc44593e7bc2f3c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "014ca6ed77e613127001efcc67176c6ac2545c29697a818efbc150b6290c5fb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "67a274c355998846112679a9a3df4476a14ca4836981f3fc81bbd3e74f7ac684"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4bdc8201ed0589e885fed4f9ff2f1442b4e71e4ff957e4637c0940296d3eaf22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "91cc4280ef36620b79a20b88bd893adc3f00299fc169202b6ff64a7c3329260f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "33b0dc47969f6df59bb4619a61b54937e2ab01ac9f6fe94b1181121e6f226dc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9a12f7b88c138d09b4ceeea1a34597273509c3db45e3cc3cfaf58f8981dc03d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf138563062daa80d416193f0fd60fd4fc2368728f1b63055ef142c4aa3ff44a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "144b003d9392b064ae1cedfa17e1aa4494d4bde0a68ae31cb1412dc9048a8842"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7ea47fe29534ea378ce0e58e1a1bf5f8fc452bb0e3aef0243160fb3e99151cb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"36947c46a3f4853ce4f31de5b01ce8cf9b1bf2491b5c4d3f001fc88815875575"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5ee2238ea7cb134ef6a7abc9e21af083e5a77da10e3756f455dd74dabdca2772"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8f48d5d2fa7f11d7c8090f3a7743fef26069a73cd90b1d19efb1860cf220caee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17fff9e92e29717e21a489e1f9d024e60732528880643af21ec1fe8735520ad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a9b141746d348a38d6efc062778ae56c72e24305e01ee1fe782524e29006bb21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d948582baf0d45319aa1b03b3f1ee6ef452a9953186ebf407cff36a9407d9aaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d9325d91f51a3e3dd5f143cbfd7605b4aac0c530b933a268b67f59b272bf6b2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "943d5653df28951afad104b64aeb5c410c83b266e3f9242c31524f4274788172"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4d35ff8ae7a8104a77d57ab58d4e1ad12de618e4fc8398700a9918ee231fccdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06a171df8227c438c0368aa7521e1e18e0aefeadcc1ff231b41757540064377f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "22e4f34ab9adefe4b7fb850086504632db090757d2bd5eb809b29d2b60997d5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "dce999ffbd29eedd728cb4ae831935e6e838680c25fd2145a0c9de1f2b571433"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4d5e5082b19e5afbec71bd03ff0e8bf4c44c9ee7678e42de9309c49cb51f88f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6bede5df69b7825cc90d0fd27ba8de0486b43ac786d5da43cfa29d0b35b53887"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f0d653b84222b3f16f85a0113ed48d6e3bec7d66519a493452515389c616dba9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf3e8fa1ad57c0f7951aab9f625f99e71b83bc4bf05adf2c5704b34a7ece01c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2ad3f43b2dcad53a568af8937d946a713e1cd7ab7d30981771f9c4559477408b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5b36ae62e6bf5030ff2fe392d876b54cb03de6ff202f306b22ee0f1bf5c052f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a8d195f66957c7c2aa686dc01a6b435c1614ad4166644ea416afaa00dcdda541"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9bf621bb25e5ab8f0019ae0c846301a4e5ce3129e97320204f0ee5dcbde8d736"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "125d24d778d9ecd547d8dd0a73dfb3f05c088aeeb31a1202cb9390a1e8bce033"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a36158f3237850a6104f16ea239f12c52c6f2d60ed12bd7b3669c26c219b3fe0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "79f55f6b44e2ec220cb387bddc648d4af3638dbd3b1d7b33cf13f4769c8e4b38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "795d59adb81386f0045f871c021b430841e2f69c02438652e784811336aa8ee6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e57062e66ff3139600af2023419345b5941aacb2a071da4f705e9e4365c3f867"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a4f7ce2282e3c331c7c0e7d5a9663d643406c9fea67598201a2c4012682e49b6"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eea0ee5bbd572e0d4d9704a12987c46ebe1b0d55f0cd002d3a6356b7c7194768"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1f19222f99c5a8495c19e27f0d34099ed49dcf71420d224b214195ecc62bc9e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3ac47eeec918d46a1459daf42988e172e25b401ee381b77a71efbb577a8feb8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 
512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7526746531d42122bba7e5381892d48e9710e40cb52f0ae5fa9281ce972eaf60"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1e11f7fb2ea0a9087bc584298bfeaba5613ec4c8239cb4aca9d3b2006455263a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7146d11263067316abc32f60026707001e97b6a620ae8afa3a2b85839a4f8832"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4fa24ecf819e4cb01af69d2314681712c50964a82d87d3bb4c2bdd17dfc34db4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1353ddcc42f6cc66f6ee08fb22c45627b861ed3e4084933f843a715174552482"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e4cb0df157e96212b565a79eb487f7935dd2012934b9afb446b063b4fb85f18b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "dbf527a74a2b0766b57535042ca6d6e0b5591ffd06e01921a67b51c5173dbfe9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2f3d6104b59b917948f6b3b5bbccad960e93c12a4a83631b9b07f770684feffc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c7c45ea305e37e79be18e17904f16068831a77662126d72417f1e80773e8ee38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7898f08be043d3f006719e68d71f09f79bf473ae34aab19b3ab262a1a4924052"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e40d31e2e6bc567267446b097ebcea9d413377af76000d66652e4d632365c314"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"de1da4839c51a07c0b75aa530385b88642ac062916ccc80f5211e87be1ef6deb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8b75f0cd7eec17721b235585540445ada046537f6ccd6a83a185ed415848a149"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c0fa2d6c625517745a0607ca6aa7c84a42e6d316115b673a4fa3bd003305fa50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e6ca692e7a0781ae0586be9e9f1d91f62e4cb14cf33758df690a986263b05b78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "aa264d6bc47cb61cab2f18705bbd938b9f7fb900172521e5fe5c246c409368bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "85316ed101eb192471a1052c0da54c0d638c288391dd7a25dbc02296be7a296c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2d2ec1ba2bec477e82dd86499babf318d7d940a9206a129c9bb756e7a3f359d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "af2e4f5f603c7136a8b15ec33f95c5d442358f8741e0c0ff72ef38626d97bf90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4c239e64009d24898a9c79cb139efa31f48d0b58fb1f6aef4aa7fd22900f7cd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "75d35f3b6edd9093e458734c307f849062eb65e6902b3e323507490e59c524d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a03678184645eec1aba25cfcff50cfeb2aabb4a92538259f3660577bff0cfbc5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"d0e7d2346fed92a3384390cdee8ef0d40c8bfbc04cbdbf8498db4c812e6d6001"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "360cf65b8814569a997f0b42e464a26f83aaf6edc6995afaf988bc0756ca465b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f297ce8a5eed5adbbe03c30e6d4fa5617eb72de72527f5dd59ff6ef326498a5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e0daf676c0e64e2b4a7d1924764f28a774aaa8a5346a3c5def913bc21b5e630f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c98f6cbed970cf184c9dc2280653ae8d9a3a858755c846fc900918a64577e2a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7c27b56d1d5bfc83e7807c3aad9132f2a8b2c80c663fb326c18f59420dcdf444"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "57b2872d1e919a6226c26a8ed425acd881700122e9f70dcdc7321c5085ded141"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "16f6b52df67f622ebf94c8c8f6ef7f84f9a9bdfbf6ddfd1e0569e14caab40367"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "16fed94e9aa03d508e8179c8ed5c449edf1896202c1e024348851b8f46ef01b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "afcefb117b45ea7fb0980b0c6ad1fe5ac04f61332b9ce5c471e6f1af1e8b43e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "55eb006cefd5749074cfc1558187b64940f1a635eb6564521803d8e7b28b4857"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 
1, 0, false, false, false, false, false, "8fd1fdca9c14c7e8ed9bc58c74ac97fd20aa8ff9036c5be7e915846c058f96df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "87eac59ac6653b8854b609b0b5bdd7e48c100ecf9a7b2129e0c0d495672fd57d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8d92a9fa28640997109579f6f4a0094f7964b79ef1bcbf937c8939867c13e527"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "489b736a717f881eeeb4e18a788c87c4554cfb67f03d6e5f6c4edabd28eea4a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f46d1d0bbfa3e99ffe3621182c2542ac3fc5d252309c6bf94be044af7f98f7e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "49b37bf6b7b3ec563ca55f915efb3bfa4164190d02d26659b475aebb35c7e063"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fbe7a013fb10e9d3198f607569a3a36e0d75db8b80a1246652d8b0af20e75169"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c415175a6586c0d8c24e81b0e02687009ed725b0020763ac0d4d884ae578e4b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5983405fd9dc4a016f9d4b5fe0e19b8e249033e1b88047695d52607c2c16a0f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "612fb001a47dd7f5edad06676a521d2909b49d55cce4db01bbd5ecccb3576f57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a4569b9389199160aed3930a80f77dbb667dd762eb2de60a779b9e2138fa3d71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"dacb1f9c1b83875750bd5792c146dc6e5eac6dfa6acb67352902052d68e21611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "70fd20eff5be6bdd3326cb8c2627cdbf414ba87a77b437e1782b45ab6ac4cb35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e491cd65ea0b67d1ece6df623dfb58a766bc6b4ea5100e41dfe49cc8bd3fcdb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d078460c57819b4d0d1c3f5fe6ce73eafa2033f8d7657fb70b1168a533b2b25f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5cdf5c0a903da2d995101012b3fd90c88ace027478357c156176148d93745450"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "eb5d3e145e834d93699189d88a51243c9f08e43c1385bf010b06c239b3dd46c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d547b157dbad1ee259c650d721e30a8873f000cd6e3dd0256f4a25c4d348bc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a10445f07089ba8c8ad92ea0a70df62a526acc7fac60b735787c001e309f9ff3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "090abf148986123b7f0b15d77374092ef7eb090efbdf48bf44822bd846142c08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d5d7c82855b9d6aeac6707ec6eeed005e90f4a87a74b3909aa59e07a63b1fc4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5f0a48e82eb69fd2193b245e5628a9724e39df758981eebfaa2138551cabe9f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ebf30a181d8315b098205ea7aeb3eb92bae50f7f37700092e40af7bdc3210574"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2041c1bcff893c91e2895180071eda92459aabe6fe634828aca42b782689f75b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5895afc9817af940c56abcf036ec8ff7b9ae03c1f658ed50283024c20568b105"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0edbc421ab05fb39e1b71fc35383b3d550a62c95a4abd293053a7ac9d8745eec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cdccf28ba3cc22d872d050dda3f102e6320aee3878b46348ddf987609d6bfbe7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "be51ea950d9edaee840f0cb1218751ed8470e77f72bb8161a841e8b8b2edcd3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a32c4075650da102f824e322dc713ab303fac0c3363c0374af2748e757935e25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7f4b1a37df4b8dd077a4c522bd6e5f295079a912dbc6cef017c195d7938ddd10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "62fc5c05d9887edc1344f83682f3c6649e7a6a4bd8f94358d28092146fb161f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b6644272548ba651268e6f84ba464585ad313e97d87eb7289c430a17748ba29a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "81206b5f912d7519013979abff703bdfd92db0e72b0ef1e93eda8b60c707426e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "986212c4ca276774db46a551545a6234fbccabb5a5b35bee13f633e22eb45855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "5ae09c82b34f3db257bdd94e5275314a2c4fedea39bfeb4f73acae00d0cd4ddb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9353795de2e8ebcfed23bd106079ef6d6b0dd5ff96b395056cf30b5088bf61e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fc12ef83fad73825f52ad44468e4e4feac385311fc5054b122b11f6f74044c83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7a652556d591bbefc06b43ec1b5e9eae76f5d0aa6f97ce57b761ba655b96dfd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ce58a1fe839457a8094a2a7def1761fb3619e171a79560dde428359d9f505566"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0ecf55e3725696d8f4bc52431b5fe5a63ea3eeaf1907839d411ed04ce8419ebf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, 
false, false, false, false, "9fc91cf1354889ec2b67f1553986322ee2bae0024dd86103da264fbda9d9c7e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5269ddaceb4bad03b6f3162c92ddd5b314ce7c936cce4bd71dd5f2dbc408ac92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c29682e11e319e68d1b5a5db1a6438bb583b0b61525fdc8833de09387217959e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "60a7a01fcb7b3cee19c304c31427d2fcb50812132ba01df8e6721d2a22bebfb1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1a4e099ceb3379cb53e2783a9f5ef1d18b7cb716bea715db43a45f665473611b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1c0f8fba78ecdcfc7ccbfa223025c77df626f0e349e7301b7f3888a36d204168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e3878148d8a450894a0fe3935a333f84b2364eb7101ebaf970ff2d90823145f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00cb11734a5e21e45cc98769fc0079a4868abb84a43c60fe686b654a9ddfb98a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "91aa2e3cdcff6ec2703e411344b62769648b8612db50bc9f2628366c8bae8de6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39e713b9f51404b56629ba2c03941daf35c9d0266721dc0b5cb08c50d0f7aa17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6c5d39b1589758e2c6eb187ccc528418766ea9d4a06aed309a701fa658c676c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ca428b467f16e18f3268fb9f693ec1e2925cf1119302ad154b209243c0afd679"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d41f7470f967edc52d17067e26fec3569284d9883f5e3bac612367956ac9851a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "abf36a35ca9d3ec7795edca478c803d616ad6f0d1b067421d57b26f63cb72c0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5b8e951a959928def41a140f3518566b818f827dda71a93dedee738e0db8b0fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15446a717f8614aa8d0ee6d1fae4935bdfafd3d6495469cf49e13f8abb8e06fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "928b2d2b3f674ea10c909f72d411b8410bac6281b031056b5d890e226e21570b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "16c6f70dabc5ba230f5f0daf37c3cdd5ec79b7ae7bc8bcee0fdf1031786639a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "0bf88739565b7f08a38c781a2e49f421a882a9afa81ea1a317336156a1786d42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3ecf158d1ce0468f1b4d018b1f03d4bfa67f52a770c735ded265fe2eaba08ac0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "58343706cda0ade14d3b1a1eb276bcd9ffa294e6f025f9bfa671256adb70fb67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d1acb626847b4bf4822563f4d529c90038adc29240214754a9d614bbda876f6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d39c41027acdbf1113dda57ee66199c39569ffc3af673a0b3f4797e4c6c35811"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "97685d2389b58535baf05dbaa92adb8b0704a1e0db78af281f60ff7d74ebd168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "66e0a0231587b0f897edfd6bc89cc0ef5263c37f1ec4a77b847852dd1fccccc3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c786f32cde2501b718b3e5aededf4d1adc7d13238d8ad40fbf7487512457b5de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "169586a393b78d5ac46fcddffbeea6adec94914b9cfe5717eb49ae98994b629c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "95cd0219cd0ddb6209349f18ca19549e059ec399c879900bc2924674ae057c25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ae72e413f4496306794c4bd790e8540ca67ec3a570100b6a2d13edee76087bdb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cc32129b827d55206992311fba216ed7f912567ba55bceeb4e7cee21a97a8f9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "da81af4a26292003aef93cb08da414f3398730fd4f137d7ac09edd5e9062cda6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6b80a93f7cc683151eecf6e769f9d7cc195886dad21a54c2a70c630692292287"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0576977c5a4e52f255af1791fbe613ef7743728ceef8a035b18b2bd349a25220"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ab12f7632e3a639ba2309728c1e922a51e6ad7665adf1f0f38b7846a498ff4c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "55ee7d9b5621a93316f5a1cafee41256070e2c8948ab655d62f15cdbec90005f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9f0250f9e68d3f22cfb9106c0d4e2e814341146edc55e1157397ad19312a79de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "05925425256a3a5e0a4f337d3dd43fc4b405377318cffd59be80e4828fcee543"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7437a6adc4c7de8518028d83839c4fbf26e4a626e78d625968a8f213c7ad5b31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "28270803d084ea77d15d51f711fb53c38a2367d449a97969e9550983fc1a2b63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "69fd313dd39f6aa2638200cec1163d48639ecde21f834d0b0033120dafa8ebbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"1443787fb038a4c6ec9032ef9d260bdd1e74cf8abc2654512aed50cba1b62895"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1186bf72e6657767c768a14db6b915f199fd82e907604e8f8afb1a67c3eaa5f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08cc6797d6827cff8eeb485ec13c6361166df31891d43816ee5748afca068eb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "55fca17259324e689967ea7d82cb6e35e6c0b06b9133e927e4f2007893b74b36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "be5612dbeea96d69c09e31b5112558121b00064ba85041a41adab0d177e55816"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "744f29883c731833ce00790bb00ed6a721464c3010fe2190b579cd6698cece52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6ffce0f9972beb968ece8d23874d220f88929316fe81a1bbd5d37d50e42741bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa71cf66265ae9ce0ac7fa1d07be3733f480b48a92fa0a2dba96b72eba227e2b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1f02d316ec3df836c8a262aa5db765cbeedfe4f9616385aa8aee7ffad04fd36e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "560cdd6daaa3a348a20f44dd687c519fb0b4b51a162e855539a2dbd30b317f00"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e2d94183b0e30accc6d6c6bf437b51d612939831134e810b356d793c092bae34"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"c116e0b14b2b321cb0a93e7519e80fddc5af3df400f9bdd1920913784821c5c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bdf5231cbe222e661a67812cdf2829e4f1f25b947714c24bd84559316d96a719"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b06fd3381a60dcbe984a1c1427848858a202c635a89831e0ef628c97af697355"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c8787d5a7df04f52c988c64c7cff9cea3d84be89ba65b13f14bca851202f8d05"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "47184ab746be0b88a2c2a6a418283fce68490f3c47a3c29eb3ca3b0b3dcf0a51"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dc8a670d26ced424fb47058f9e8d9f51dcd4383e86bb5e665f3f56d01f5d9bd0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "963ac970c070c834a078edb72f1f0c1de830917e72b0149ff733e9720f2a69cf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1c65c5a14c9def975371a5476c49c42db4a2d5cd23c2db638b9d7286e0f44dd3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "95c5c9d1bb6023ad115ed8ae8b75ab317c9142a5aabc7c0ec2b39d6950d9887b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ea44f57a56fac0fbaa388f9da2f0e2b071ad6b9b5fca99be4ef2e44cef892aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "49d7fe4376a9e7411f9f45df2307b418d1766f99e95214347a721eefd916d244"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, 
false, "5679e5d51d637103396228e8719a761361ac15101edba1508550a9e50bb88743"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a0e1b51c0efb8f0864676cc68039c186c13b1f69de9a8d7b816b13a20155158c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "50f346d23f930c1277cf24c293492c10756208061c003ad86622157593a9d1e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4b1522f0913d209e03a72068f66477a9791d55274f656aeea90ab1208ec73156"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "01ff9dbd2734d11b5f273ee4cfaa04c8cf9bbee796bd2a902877ddceb31439ff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1211da1a72937b17402db25ee79036e39fa00da71ddc8e34ea923857c4ccfc73"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a2120aae610b64ec82e0d4845cdcda650834eb9911e12a77f130766148dd71ff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b7a661a78716067d060185740c7086e6d354244327244981efc34ba619522d84"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c6cdd7365d565a5de9d40e8d379d90d499eddc1c6c8110a884e07613acf70054"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ec2887d02d985dd0c782fa6bb6acf8b668c01172b62ed9528a0bb4e549d93a8b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a3d4d6526c3693e71d4ce051f181ac3441ee42409c4160a4300dbd7676ec2e81"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 
0, false, false, false, false, false, "77442c9fa68fa3d6375d35adf57c5621eef905aecaece965ca9920eedc8f9763"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e17ba02f62bca92562af42ac5fb6ce045be925ba460998a25f5bb24b0036465d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5b1e7caaa771ef24ce28f820a43182d44a46e193d54bf090c3eb0d59d0d949aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "47fb47089120fa77dd32ffc1f0a8d5d35082392445759a9f0d21aaebda8b5bcb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ab95ed920017c83e85b71a2f3e6cee0c5746210be641228b1e900f8a183bec13"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e68b0aeb811e59d93e3adb75c59685e5cd1f69ec516d822c13dd2996843abd65"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "228c5c5e519329aebf09cef72062ba10dfa2933aaf25c25005b7751296e8165b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "25cb23c05ecdb47b854d94cf84e5ba1c8408c24a4044101f325e22f0ba7e7040"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b05bf9145cad8cbc02ea3fd651789fc441b74c066e7fecd1906cda93e5800843"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a1f05a7c84112c2621e858b3200938ea2d3323cec860bbbc02ebcfdccdd84f6e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3be1542f9365be539de5fc7f8109f332c4bb1c1b10fa305960278b6ab2014df3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "60dd62bba4775f562b14f0998cfd105ba6317f72c13b2f309cbdd9fcb442ba03"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a97c1779cc4ae0bbcbecaf3df4b24ec52c176e5366b801669047b395206f5ee8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eaede5d634bafc9c29d331bbe9ac16ec33763dae345ebc8eb4a42c12b33d123b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "aa099c828f44bf0f8cfdf8f4dbcbf8b521110ab8db36e61a066ca7010cb2bfb7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a58ac0e0b913c441fe3cb542bb367af85b94a6b9453b24ee50e8abb2093aed38"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c53ab19beb06d26473d492c04641315b32565f1159814f0a800bfe91b9773a4b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "83c945fe1276e8ad5e7c7883c00c76cf2b2343cbe86c17b15f99590b94037935"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "c7eb2080349acc3860fb9d7702140c19333234c3987f8870bdbe1e53ae703342"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "27e7f095fea12aac97a82e7a74502eb788e7567531507c05f9887da060697d10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91f80741f4dcb2aedc6bc27ab7145818b998e81bffd12f8235043ea8159bf3f9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51f5fde6fdc919c0a38aae1f537bbe3ccef479c8d083fca265dccbb911b2d7c6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"99204916b4ed12a8884b98570e06c9dba63d8d19bb15024d6ec7a74e421a04e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "041147f93c30dfe4146b1acbaa232c115d3a47639370d5610ec1d9ae74ae2c09"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1848b3a3bc8894b7145eae3caf9903226030ed8898f1d71a3d4c771a8a5fc4e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e454e547b899dedc442df595198f801d84b98f47cb1a3ff848cce24472429a10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e18d8f859b1b5f368bd09ed50c1e44bc4228b83e740d399e407b9fff88b6c0f0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e582677e6a79c0e57c3fbcd41321e7082ef5cc33f63e237719817c0d91624e5f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "019056de5fe0a1a9e8456e2b3e4c3807478216b232905a1a22c320c100b18be8"}, #endif // EXCLUDE_SM_100 }; // clang-format on diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h index ea2027e709..7961213f2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h @@ -118,6 +118,9 @@ struct KernelParams int32_t mBatchSize; // The chunked attention size in log2. int32_t mChunkedAttentionSizeLog2; + // The factor to add to the maximum value to increase the probability + // of skip correction during next iterations. 
+ float mInflateMax; // The log of the Sage Attention block size for K. int32_t mLogNumEltsPerSageAttnBlkK; // The log of the Sage Attention block size for P. From 8fefa2c9d142fb4e5ae0d558c504b70ab312efad Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Wed, 10 Dec 2025 02:31:29 -0800 Subject: [PATCH 056/172] [None][infra] Fail fast if SLURM entrypoint fails (#9744) Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index bc8ceaaf31..cf83bd5541 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -625,6 +625,19 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3) } counter++ + // If entrypoint script fails to start, do not poll for agent connection + try { + SlurmConfig.checkJobStatus(pipeline, cluster, slurmJobID, remote) + } catch (InterruptedException e) { + throw e + } catch (Exception e) { + // If the exception is about job being inactive, enrich it with log path + if (e.message.contains("is no longer active")) { + throw new Exception("${e.message}. Check SLURM logs at /home/svc_tensorrt/slurm-logs/slurm-${slurmJobID}-${nodeName}.out on ${cluster.host}") + } + // Otherwise, log the error but continue (SSH might be temporarily unavailable) + pipeline.echo("Warning: Could not check SLURM job status: ${e.message}") + } } } From 8cec2da3758234c8f0dbc52963b98fecf68651a7 Mon Sep 17 00:00:00 2001 From: "Brian K. 
Ryu" Date: Wed, 10 Dec 2025 04:13:48 -0800 Subject: [PATCH 057/172] [None][feat] Port fp4 quantization kernel optimization from FlashInfer (#9854) Signed-off-by: Brian Ryu Co-authored-by: Nikita Korobov <14355239+nekorobov@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/quantization.cuh | 119 ++++++++++++++-------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 7aacc0f31d..665ec2b42e 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -794,67 +794,102 @@ quantize_with_block_size( asm volatile("griddepcontrol.wait;"); // Input tensor batch/row/col loops. + // Optimization: Iterate over actual rows first (hot path), then padding rows (cold path) + // This improves performance for small batch sizes with swizzled layout for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x) { - for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) + // Early exit for padding-only blocks: if this block only processes padding rows, + // we can skip the batch loop and just zero out the scale factors + bool isRowPadding = (rowIdx >= numRows); + + if (isRowPadding) { - for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) + // Fast path: This row is entirely padding, only zero out scale factors. + // Note: Padding rows do NOT exist in the output tensor (which is sized [numRows, K]), + // they only exist in the swizzled scale factor layout. Do NOT write to output buffer here. + for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) { - std::optional optionalBatchIdx = batchIdx; - std::optional optionalNumRows = numRows; - - // The SF output pointer. - auto sf_out = cvt_quant_get_sf_out_offset( - optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout, layout); - - // The input tensor offset. 
- int64_t inOffset = static_cast(batchIdx * numRows + rowIdx) * numColThreads + colIdx; - int64_t outOffset = static_cast(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx; - - // Set the values to 0 of those are padded columns. - if (rowIdx < numRows && colIdx >= numColThreads && colIdx < numPaddedColThreads) + for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) { - // Dispatch the quantization kernel. - if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) - { - reinterpret_cast(out)[outOffset] = 0u; - } - else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4 - || quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) - { - reinterpret_cast(out)[outOffset] = 0ull; - } - } + std::optional optionalBatchIdx = batchIdx; + std::optional optionalNumRows = numRows; + + // The SF output pointer. + auto sf_out = cvt_quant_get_sf_out_offset( + optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout, layout); - // Set the SF padding to 0. - if (rowIdx >= numRows || colIdx >= numColThreads) - { // Set the SF padding to 0. if (sf_out != nullptr) { sf_out[0] = 0x00; } } - else + } + } + else + { + // Normal path: This row contains actual data + for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) + { + for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) { - // Load the input vector. - PackedVec in_vec = reinterpret_cast(in)[inOffset]; + std::optional optionalBatchIdx = batchIdx; + std::optional optionalNumRows = numRows; - // Dispatch the quantization kernel. - if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) + // The SF output pointer. + auto sf_out = cvt_quant_get_sf_out_offset( + optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout, layout); + + // The input tensor offset. 
+ int64_t inOffset = static_cast(batchIdx * numRows + rowIdx) * numColThreads + colIdx; + int64_t outOffset + = static_cast(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx; + + // Set the values to 0 of those are padded columns. + if (colIdx >= numColThreads && colIdx < numPaddedColThreads) { - reinterpret_cast(out)[outOffset] - = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); + // Dispatch the quantization kernel. + if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) + { + reinterpret_cast(out)[outOffset] = 0u; + } + else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4 + || quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) + { + reinterpret_cast(out)[outOffset] = 0ull; + } } - else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4) + + // Set the SF padding to 0. + if (colIdx >= numColThreads) { - reinterpret_cast(out)[outOffset] - = cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out); + // Set the SF padding to 0. + if (sf_out != nullptr) + { + sf_out[0] = 0x00; + } } - else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) + else { - reinterpret_cast(out)[outOffset] - = cvt_warp_fp16_to_mxfp8(in_vec, sf_out); + // Load the input vector. + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + + // Dispatch the quantization kernel. 
+ if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) + { + reinterpret_cast(out)[outOffset] + = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); + } + else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4) + { + reinterpret_cast(out)[outOffset] + = cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out); + } + else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) + { + reinterpret_cast(out)[outOffset] + = cvt_warp_fp16_to_mxfp8(in_vec, sf_out); + } } } } From df1adfbb501f3e8a1dd3678da1535d101e58739f Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Wed, 10 Dec 2025 04:24:30 -0800 Subject: [PATCH 058/172] [TRTINFRA-7328][infra] - Move half B200 tests to lbd (#9853) Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index cf83bd5541..071cf5fe6f 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2941,8 +2941,8 @@ def launchTestJobs(pipeline, testFilter) "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1], "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 2, 4], "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4], - "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4], - "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8], + "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], + "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], From 072f23600295c0b16193b52cfe7d707003eb11ef Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: 
Wed, 10 Dec 2025 20:41:04 +0800 Subject: [PATCH 059/172] [None][fix] Fully resolve the tactic recovery issues in AutoTuner serialized cache (#9835) Restrict tactic types to those compatible with AutoTuner cache serialization and deserialization. Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 32 +++++++--- tests/unittest/_torch/misc/test_autotuner.py | 64 +++++++++++++++----- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 609efd1055..748b6cbe04 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -169,6 +169,10 @@ class TunableRunner(ABC): means. User can choose to implement their own types of tactic for flexibility, such as using a dict-typed to represent a collection of named configs. + The type of the tactic is arbitrary. But serialization/deserialization of the cache requires that the type is compatible with json.dumps/json.loads. + To evaluate if a type of tactic is compatible with current workflow, try the following code: + * assert YOUR_TACTIC_OBJECT == eval(repr(YOUR_TACTIC_OBJECT)) + tactic==-1 has special meaning, means the fallback kernel which should be able to implement any shapes This fallback tactic is needed for 2 reasons: * when the autotuner cannot find a valid tactic in it's cache. @@ -475,14 +479,22 @@ class AutoTunerProfilingCache: } for key, value in self.cache.items(): - # Convert tuple key to string for JSON compatibility + # Convert any simple object to string for JSON compatibility key_str = str(key) - runner_id, tactic, min_time = value + tactic_str = repr(tactic) + try: + assert tactic == ast.literal_eval( + tactic_str + ), f"Tactic is not compatible with json.dumps/json.loads" + except Exception as e: + logger.warning_once( + f"[AutoTuner] Could not serialize tactic: {tactic_str} for cache key {key_str} due to {e}. 
Deserialization may fail.", + key=tactic_str) serializable_cache["cache_data"][key_str] = { "runner_id": runner_id, - "tactic": tactic, + "tactic": tactic_str, "min_time": min_time, } @@ -511,22 +523,22 @@ class AutoTunerProfilingCache: cache = {} cache_data = serializable_cache["cache_data"] - def lists_to_tuples(obj): - if isinstance(obj, list): - return tuple(lists_to_tuples(x) for x in obj) - return obj - for key_str, value in cache_data.items(): # Reconstruct the tuple key safely try: - key = ast.literal_eval(key_str) # Safer than eval() + key = ast.literal_eval(key_str) except (ValueError, SyntaxError): logger.warning( f"[AutoTuner] Could not reconstruct cache key: {key_str}") continue + try: + tactic = ast.literal_eval(value["tactic"]) + except (ValueError, TypeError): + logger.warning_once( + f"[AutoTuner] Could not deserialize tactic: {value['tactic']} for cache key {key_str}", + key=value["tactic"]) runner_id = value["runner_id"] - tactic = lists_to_tuples(value["tactic"]) min_time = value["min_time"] cache[key] = (runner_id, tactic, min_time) diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index f26d7bf81e..2323d0ac98 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -1,6 +1,7 @@ +import itertools import os import tempfile -from typing import Dict, List +from typing import Any, List import torch @@ -327,7 +328,12 @@ def test_multiple_dynamic_shapes_cache(): class GemmRunnerComplexTuningConfigs(TunableRunner): + + # test serialization of different types of tactics valid_tactic_ids = [-1, 0, 1] + valid_tile_sizes = [(128, 128), (256, 256)] + valid_cluster_sizes = [[1, 1, 1], [2, 2, 1]] + tune_max_num_tokens = 32 def get_valid_tactics( @@ -335,40 +341,50 @@ class GemmRunnerComplexTuningConfigs(TunableRunner): inputs: List[FakeTensor], profile: OptimizationProfile, **kwargs, - ) -> List[Dict[str, int]]: + ) -> List[Any]: # During the tuning 
process, we verify if the tuning config behaves as expected - assert inputs[0].shape[0] <= self.tune_max_num_tokens, \ f"Input shape {inputs[0].shape[0]} is larger than the max num tokens {self.tune_max_num_tokens}" assert inputs[0][-1, 0] == inputs[0].shape[0], \ f"Input shape {inputs[0].shape[0]} is not set through the pre_hook correctly" - # The simulated delay is not deterministic, so we need to return specific tactics here return [{ - "block_size": block_size, - "tactic_id": tactic_id - } for tactic_id in self.valid_tactic_ids for block_size in [128, 256]] + "int_tactic_id": tactic_id, + "tuple_tile_size": tile_size, + "list_cluster_size": cluster_size, + } for tactic_id, tile_size, cluster_size in itertools.product( + self.valid_tactic_ids, + self.valid_tile_sizes, + self.valid_cluster_sizes, + )] def forward( self, /, inputs: List[torch.Tensor], *, - tactic: dict = {}, + tactic: Any = -1, ) -> torch.Tensor: # Notice that in fallback case tactic is -1 if tactic == -1: # assign default configs for fallback case - block_size, tactic_id = 128, -1 + tactic_id, tile_size, cluster_size = -1, (128, 256), [1, 1, 1] else: - block_size, tactic_id = tactic["block_size"], tactic["tactic_id"] - assert tactic_id in self.valid_tactic_ids + tactic_id, tile_size, cluster_size = tactic[ + "int_tactic_id"], tactic["tuple_tile_size"], tactic[ + "list_cluster_size"] + + assert isinstance(tactic_id, int) and tactic_id in self.valid_tactic_ids + assert isinstance(tile_size, tuple) and len(tile_size) == 2 \ + and tile_size in self.valid_tile_sizes + assert isinstance(cluster_size, list) and len(cluster_size) == 3 \ + and cluster_size in self.valid_cluster_sizes return [gemm_0, gemm_1, gemm_fallback][tactic_id](*inputs) @staticmethod def inputs_pre_hook(inputs: List[torch.Tensor]): - # always set the first element to bo iota in x + # always set the first element to be the number of tokens in x x, w = inputs x_hooked = torch.zeros_like(x) x_hooked[-1, 0] = x.shape[0] @@ -389,13 
+405,29 @@ def test_autotuner_tuning_configs(): # Test if the number of tuning tokens is clipped to 32 tune_max_num_tokens=GemmRunnerComplexTuningConfigs.tune_max_num_tokens, inputs_pre_hook=GemmRunnerComplexTuningConfigs.inputs_pre_hook, + use_cold_l2_cache=True, + use_cuda_graph=False, ) - with autotune(): + temp_dir = tempfile.TemporaryDirectory() + with autotune(cache_path=os.path.join( + temp_dir.name, "test_autotuner_tactic_configs.json")): tuner = AutoTuner.get() - runner, tactic = tuner.choose_one("test_autotuner_tactic_configs", - runners, tuning_config, [x, w]) + runner, best_tactic = tuner.choose_one("test_autotuner_tactic_configs", + runners, tuning_config, [x, w]) - runner_0.forward(inputs=[x, w], tactic=tactic) + runner_0([x, w], tactic=best_tactic) + + # Test if the tactic can be loaded from cache correctly + AutoTuner.get().profiling_cache.clear() + AutoTuner.get().profiling_cache.load_cache( + os.path.join(temp_dir.name, "test_autotuner_tactic_configs.rank0.json")) + + # No further tuning should be performed. 
+ runner, deserialized_tactic = tuner.choose_one( + "test_autotuner_tactic_configs", runners, tuning_config, [x, w]) + assert best_tactic == deserialized_tactic, "Tactic should be the same after deserialization" + + runner_0([x, w], tactic=deserialized_tactic) def test_kernel_testing_single_context(): From 1c11cae54d664894841c4e70d849e8446d2ff914 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Wed, 10 Dec 2025 20:53:26 +0800 Subject: [PATCH 060/172] [None][chore] bump version to 1.2.0rc6 (#9874) Signed-off-by: Yiqing Yan --- README.md | 2 +- examples/constraints.txt | 2 +- tensorrt_llm/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 208767b037..de910d1c3c 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.< [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads) [![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org) -[![version](https://img.shields.io/badge/release-1.2.0rc5-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-1.2.0rc6-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE) [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)   |   [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](https://nvidia.github.io/TensorRT-LLM/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap) 
diff --git a/examples/constraints.txt b/examples/constraints.txt index 5c54c2a838..3a9178e1e9 100644 --- a/examples/constraints.txt +++ b/examples/constraints.txt @@ -1,3 +1,3 @@ -tensorrt_llm==1.2.0rc5 +tensorrt_llm==1.2.0rc6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index de19227685..c890d49c94 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.2.0rc5" +__version__ = "1.2.0rc6" From 2f030312a8ab3df3127dff2cc64b38d523275b24 Mon Sep 17 00:00:00 2001 From: cheshirekow Date: Wed, 10 Dec 2025 05:01:19 -0800 Subject: [PATCH 061/172] [TRTLLM-9228][infra] Verify thirdparty C++ process (#9367) Signed-off-by: Josh Bialkowski <1309820+cheshirekow@users.noreply.github.com> Co-authored-by: Josh Bialkowski <1309820+cheshirekow@users.noreply.github.com> --- .../defs/thirdparty/test_cmake_third_party.py | 163 ++++++++++++++++++ .../defs/thirdparty/test_git_modules.py | 105 +++++++++++ .../integration/test_lists/test-db/l0_a10.yml | 3 + 3 files changed, 271 insertions(+) create mode 100644 tests/integration/defs/thirdparty/test_cmake_third_party.py create mode 100644 tests/integration/defs/thirdparty/test_git_modules.py diff --git a/tests/integration/defs/thirdparty/test_cmake_third_party.py b/tests/integration/defs/thirdparty/test_cmake_third_party.py new file mode 100644 index 0000000000..6ba7389fb4 --- /dev/null +++ b/tests/integration/defs/thirdparty/test_cmake_third_party.py @@ -0,0 +1,163 @@ +"""Find bad third-party usage in cmake. + +This script searches for cmake function invocations that might indicate +the addition of new third-party dependencies outside of the intended +process (3rdparty/README.md). 
+""" + +import argparse +import collections +import logging +import os +import pathlib +import sys +from typing import Generator + +logger = logging.getLogger(__name__) + +IGNORE_PATTERNS = [ + ".*", # Hidden files and directories, like .git + # This is where we actually want third-party stuff to go + "3rdparty/CMakeLists.txt", + # Historical use of ExternalProject_Add that is not yet migrated to 3rdparty + "cpp/tensorrt_llm/deep_ep/CMakeLists.txt", + # Historical build that is not included in the wheel build and thus exempt + # from the third-party process. + "triton_backend/inflight_batcher_llm/*", + "build", # Default build directory + "cpp/build", # Default extension module build directory +] + + +class DirectoryFilter: + """Callable filter for directories. + + This filter excludes any paths matching IGNORE_PATTERNS. + """ + + def __init__(self, parent: pathlib.Path): + self.parent = parent + + def __call__(self, name: str) -> bool: + path = self.parent / name + if any(path.match(pat) for pat in IGNORE_PATTERNS): + return False + return True + + +class FileFilter: + """Callable filter for file entries. + + In order of precedence: + + 1. excludes any paths matching IGNORE_PATTERNS + 2. 
includes only CMakeLists.txt and *.cmake files + """ + + def __init__(self, parent: pathlib.Path): + self.parent = parent + + def __call__(self, name: str) -> bool: + path = self.parent / name + if any(path.match(pat) for pat in IGNORE_PATTERNS): + return False + + if name == "CMakeLists.txt": + return True + elif name.endswith(".cmake"): + return True + + return False + + +def yield_sources(src_tree: pathlib.Path): + """Perform a filesystem walk and yield any paths that should be scanned.""" + for parent, dirs, files in os.walk(src_tree): + parent = pathlib.Path(parent) + relpath_parent = parent.relative_to(src_tree) + + # Filter out ignored directories + dirs[:] = sorted(filter(DirectoryFilter(relpath_parent), dirs)) + + for file in sorted(filter(FileFilter(relpath_parent), files)): + yield parent / file + + +ThirdpartyViolation = collections.namedtuple( + "ThirdpartyViolation", ["srcfile", "lineno", "note", "line"] +) + + +def yield_potential_thirdparty( + fullpath: pathlib.Path, relpath: pathlib.Path +) -> Generator[ThirdpartyViolation, None, None]: + """Look for bad patterns with third-party sources. + + Look for patterns that might indicate the addition of new third-party + sources. + """ + with fullpath.open("r", encoding="utf-8") as infile: + for lineno, line in enumerate(infile): + lineno += 1 # Make line numbers 1-based + + if "FetchContent_Declare" in line: + note = "Invalid use of FetchContent_Declare outside of 3rdparty/CMakeLists.txt" + yield ThirdpartyViolation(relpath, lineno, note, line.strip()) + + if "ExternalProject_Add" in line: + note = "Invalid use of ExternalProject_Add outside of 3rdparty/CMakeLists.txt" + yield ThirdpartyViolation(relpath, lineno, note, line.strip()) + + +def check_sources(src_tree: pathlib.Path) -> int: + """Common entry-point between main() and pytest. + + Prints any violations to stderr and returns non-zero if any violations are + found. 
+ """ + violations = [] + for filepath in yield_sources(src_tree): + for violation in yield_potential_thirdparty(filepath, filepath.relative_to(src_tree)): + violations.append(violation) + + if not violations: + return 0 + + for violation in sorted(violations): + sys.stderr.write( + f"{violation.srcfile}:{violation.lineno}: {violation.note}\n" + + f" {violation.line}\n" + ) + + logger.error( + "Found %d potential third-party violations. " + "If you are trying to add a new third-party dependency, " + "please follow the instructions in 3rdparty/cpp-thirdparty.md", + len(violations), + ) + return 1 + + +def test_cmake_listfiles(): + """Test that no third-party violations are found in the source tree.""" + source_tree = pathlib.Path(__file__).parents[1] + result = check_sources(source_tree) + assert result == 0 + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--src-tree", + default=pathlib.Path.cwd(), + type=pathlib.Path, + help="Path to the source tree, defaults to current directory", + ) + args = parser.parse_args() + result = check_sources(args.src_tree) + sys.exit(result) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main() diff --git a/tests/integration/defs/thirdparty/test_git_modules.py b/tests/integration/defs/thirdparty/test_git_modules.py new file mode 100644 index 0000000000..1b617a18b2 --- /dev/null +++ b/tests/integration/defs/thirdparty/test_git_modules.py @@ -0,0 +1,105 @@ +"""This script audits the .gitmodules file. + +... to make sure that new git submodules are not added without following the +proper process (cpp/3rdparty/cpp-thirdparty.md) +""" + +import argparse +import collections +import configparser +import logging +import pathlib +import sys + +logger = logging.getLogger(__name__) + +ALLOWLIST_SUBMODULES = [ + # NOTE: please do not add new submodules here without following the process + # in 3rdparty/cpp-thirdparty.md. 
Prefer to use FetchContent or other methods + # to avoid adding new git submodules unless absolutely necessary. +] + +ThirdpartyViolation = collections.namedtuple("ThirdpartyViolation", ["section", "path", "note"]) + + +def find_violations(config: configparser.ConfigParser) -> list[ThirdpartyViolation]: + violations = [] + for section in config.sections(): + if not section.startswith("submodule "): + raise ValueError(f"Unexpected section in .gitmodules: {section}") + + path = config[section].get("path", "") + if not path: + raise ValueError(f"Missing path for submodule {section}") + + if path not in ALLOWLIST_SUBMODULES: + violations.append( + ThirdpartyViolation( + section=section, + path=path, + note="Submodule not in allowlist (see test_git_modules.py)", + ) + ) + + if not path.startswith("3rdparty/"): + violations.append( + ThirdpartyViolation( + section=section, + path=path, + note="Submodule path must be under 3rdparty/", + ) + ) + return violations + + +def check_modules_file(git_modules_path: pathlib.Path) -> int: + """Common entry-point between main() and pytest. + + Prints any violations to stderr and returns non-zero if any violations are + found. + """ + config = configparser.ConfigParser() + config.read(git_modules_path) + + violations = find_violations(config) + + if violations: + for violation in violations: + sys.stderr.write(f"{violation.section} (path={violation.path}): {violation.note}\n") + + logger.error( + "Found %d potential third-party violations. " + "If you are trying to add a new third-party dependency, " + "please follow the instructions in cpp/3rdparty/cpp-thirdparty.md", + len(violations), + ) + return 1 + return 0 + + +def test_gitmodules(): + """Test that no git submodules are added to .gitmodules. + + ... without following the defined process. 
+ """ + git_modules_path = pathlib.Path(__file__).parents[1] / ".gitmodules" + result = check_modules_file(git_modules_path) + assert result == 0 + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--git-modules-path", + default=pathlib.Path(".gitmodules"), + type=pathlib.Path, + help="Path to the .gitmodules file, defaults to .gitmodules in current directory", + ) + args = parser.parse_args() + result = check_modules_file(args.git_modules_path) + sys.exit(result) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main() diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 156cfc3d10..0d7d4ee601 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -95,6 +95,9 @@ l0_a10: - llmapi/test_llm_api_connector.py::test_connector_disagg_prefill[False] - llmapi/test_llm_api_connector.py::test_connector_disagg_prefill[True] - llmapi/test_llm_api_connector.py::test_connector_multi_request + # third-party policy checks CPU-only + - thirdparty/test_cmake_third_party.py::test_cmake_listfiles + - thirdparty/test_git_modules.py::test_gitmodules - condition: ranges: system_gpu_count: From ece3a8748fa121880b22d1c60eb9030f05053535 Mon Sep 17 00:00:00 2001 From: Tian Zheng <29906817+Tom-Zheng@users.noreply.github.com> Date: Wed, 10 Dec 2025 22:20:12 +0800 Subject: [PATCH 062/172] [None][doc] Update doc for NVFP4 KV cache (#9475) Signed-off-by: Tian Zheng <29906817+Tom-Zheng@users.noreply.github.com> --- docs/source/features/quantization.md | 87 ++++++++++++++++++---------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/docs/source/features/quantization.md b/docs/source/features/quantization.md index e057a91b39..7998f1c03a 100644 --- a/docs/source/features/quantization.md +++ b/docs/source/features/quantization.md @@ -11,6 +11,7 @@ TensorRT LLM offers a variety of 
quantization recipes to optimize LLM inference. * FP8 Block Scaling * FP8 Rowwise * FP8 KV Cache +* NVFP4 KV Cache * W4A16 GPTQ * W4A8 GPTQ * W4A16 AWQ @@ -47,6 +48,20 @@ llm = LLM(model='/path/to/model', llm.generate("Hello, my name is") ``` +#### NVFP4 KV Cache + +To enable NVFP4 KV cache, offline quantization with ModelOpt is required. Please follow the below section for instructions. +After the quantization is done, the NVFP4 KV cache option can be set by: + +```python +from tensorrt_llm import LLM +from tensorrt_llm.llmapi import KvCacheConfig +llm = LLM(model='/path/to/model', + kv_cache_config=KvCacheConfig(dtype='nvfp4')) +llm.generate("Hello, my name is") +``` + + ### Offline Quantization with ModelOpt If a pre-quantized model is not available on the [Hugging Face Hub](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4), you can quantize it offline using ModelOpt. @@ -56,33 +71,45 @@ Follow this step-by-step guide to quantize a model: ```bash git clone https://github.com/NVIDIA/Model-Optimizer.git cd Model-Optimizer/examples/llm_ptq -scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf +scripts/huggingface_example.sh --model --quant fp8 ``` +#### NVFP4 KV Cache + +To generate the checkpoint for NVFP4 KV cache: + +```bash +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq +scripts/huggingface_example.sh --model --quant fp8 --kv_cache_quant nvfp4 +``` + +Note that currently TRT-LLM only supports FP8 weight/activation quantization when NVFP4 KV cache is enabled. Therefore, `--quant fp8` is required here. + ## Model Supported Matrix -| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | -| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | -| BERT | . | . | . | . | . | Y | . | . | . | . | -| DeepSeek-R1 | Y | . | . 
| Y | . | Y | . | . | . | . | -| EXAONE | . | . | Y | . | . | Y | Y | Y | . | . | -| Gemma 3 | . | . | Y | . | . | Y | Y | Y | . | . | -| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . | -| LLaMA | Y | . | Y | . | . | Y | . | Y | . | Y | -| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | . | Y | -| LLaMA 3 | . | . | . | . | Y | Y | Y | . | . | . | -| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . | -| Mistral | . | . | Y | . | . | Y | . | Y | . | . | -| Mixtral | Y | . | Y | . | . | Y | . | . | . | . | -| Phi | . | . | . | . | . | Y | Y | . | . | . | -| Qwen | . | . | . | . | . | Y | Y | Y | . | Y | -| Qwen-2/2.5 | Y | . | Y | . | . | Y | Y | Y | . | Y | -| Qwen-3 | Y | . | Y | . | . | Y | . | Y | . | Y | -| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . | -| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . | -| LLaVA | . | . | Y | . | . | Y | . | Y | . | Y | -| VILA | . | . | Y | . | . | Y | . | Y | . | Y | -| Nougat | . | . | . | . | . | Y | . | . | . | . | +| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | +| :------------- | :---: | :---: | :---: | :---: | :---: | :---: |:---:| :-------: | :-------: | :--------: | :--------: | +| BERT | . | . | . | . | . | Y | . | . | . | . | . | +| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . | . | +| EXAONE | . | . | Y | . | . | Y | . | Y | Y | . | . | +| Gemma 3 | . | . | Y | . | . | Y | . | Y | Y | . | . | +| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . | . | +| LLaMA | Y | . | Y | . | . | Y | . | . | Y | . | Y | +| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | Y | . | Y | +| LLaMA 3 | . | . | . | . | Y | Y | Y | Y | . | . | . | +| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . | . | +| Mistral | . | . | Y | . | . | Y | . | . | Y | . | . | +| Mixtral | Y | . | Y | . | . | Y | . | . | . | . | . | +| Phi | . | . | . | . | . | Y | . | Y | . | . | . | +| Qwen | . | . | . | . | . | Y | . | Y | Y | . 
| Y | +| Qwen-2/2.5 | Y | . | Y | . | . | Y | . | Y | Y | . | Y | +| Qwen-3 | Y | . | Y | . | . | Y | Y | . | Y | . | Y | +| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . | . | +| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . | . | +| LLaVA | . | . | Y | . | . | Y | . | . | Y | . | Y | +| VILA | . | . | Y | . | . | Y | . | . | Y | . | Y | +| Nougat | . | . | . | . | . | Y | . | . | . | . | . | ```{note} @@ -93,13 +120,13 @@ The language component decides which quantization methods are supported by a giv ## Hardware Support Matrix -| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | -| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | -| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . | -| Blackwell(sm100) | Y | Y | Y | Y | . | Y | . | . | . | . | -| Hopper | . | . | Y | Y | Y | Y | Y | Y | Y | Y | -| Ada Lovelace | . | . | Y | . | . | Y | Y | Y | Y | Y | -| Ampere | . | . | . | . | . | Y | . | Y | . | Y | +| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | +| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | +| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . | . | +| Blackwell(sm100) | Y | Y | Y | Y | . | Y | Y | . | . | . | . | +| Hopper | . | . | Y | Y | Y | Y | . | Y | Y | Y | Y | +| Ada Lovelace | . | . | Y | . | . | Y | . | Y | Y | Y | Y | +| Ampere | . | . | . | . | . | Y | . | . | Y | . | Y | ```{note} FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/weight and UE8M0 act/weight scale), which is slightly different from SM90 FP8 recipe (E4M3 act/weight and FP32 act/weight scale). 
``` From 2c0293c612638f78ce28706f39f66e0e34c878a7 Mon Sep 17 00:00:00 2001 From: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> Date: Wed, 10 Dec 2025 13:42:26 -0500 Subject: [PATCH 063/172] [https://nvbugs/5601682][fix] Unwaiving disagg test (#9627) Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> --- .../disagg_config_deepseek_v3_lite_empty_batch.yaml | 4 ---- tests/integration/test_lists/waives.txt | 2 -- 2 files changed, 6 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_deepseek_v3_lite_empty_batch.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_deepseek_v3_lite_empty_batch.yaml index 3646377829..409a314ec4 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_deepseek_v3_lite_empty_batch.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_deepseek_v3_lite_empty_batch.yaml @@ -12,7 +12,6 @@ context_servers: max_num_tokens: 512 max_seq_len: 768 tensor_parallel_size: 2 - moe_expert_parallel_size: 2 enable_attention_dp: true pipeline_parallel_size: 1 print_iter_log: true @@ -34,7 +33,6 @@ generation_servers: max_num_tokens: 2048 max_seq_len: 2560 tensor_parallel_size: 1 - moe_expert_parallel_size: 1 enable_attention_dp: false enable_lm_head_tp_in_adp: false pipeline_parallel_size: 1 @@ -50,8 +48,6 @@ generation_servers: enable_block_reuse: false free_gpu_memory_fraction: 0.7 max_tokens: 2560 - moe_config: - backend: CUTLASS cache_transceiver_config: max_tokens_in_buffer: 8448 backend: DEFAULT diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index a7fdd1f449..54cba2e79c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -338,8 +338,6 @@ unittest/bindings/test_hostfunc.py::test_hostfunc SKIP (https://nvbugs/5643631) 
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) From 341cb1a12cba30233c7cdc712acb56ee252697f5 Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Thu, 11 Dec 2025 10:36:55 +0800 Subject: [PATCH 064/172] [None][chore] Add GB300 support since it does not support segment (#9731) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/README.md | 1 + examples/disaggregated/slurm/benchmark/config.yaml | 1 + examples/disaggregated/slurm/benchmark/submit.py | 3 ++- tests/integration/defs/perf/disagg/utils/common.py | 6 ++++++ tests/integration/defs/perf/disagg/utils/config_loader.py | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/disaggregated/slurm/benchmark/README.md b/examples/disaggregated/slurm/benchmark/README.md index 4f70f6481c..1d11aba581 100644 --- a/examples/disaggregated/slurm/benchmark/README.md +++ 
b/examples/disaggregated/slurm/benchmark/README.md @@ -30,6 +30,7 @@ slurm: job_time: "02:00:00" job_name: "" extra_args: "" # Additional SLURM arguments (e.g., "--gres=gpu:4 --exclude=node1") + set_segment: true # Optional: whether to set the segment for the job numa_bind: true # Enable NUMA binding for GB200 NVL72 ``` diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index c15748fe93..dde6576d97 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -6,6 +6,7 @@ slurm: job_time: "02:00:00" job_name: "" extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2" + set_segment: true # Optional: whether to set the segment for the job numa_bind: true # Only enable for GB200 NVL72 # Benchmark Mode diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 12ee15aba3..4446a88285 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -54,6 +54,7 @@ def submit_job(config, log_dir): # Extract configurations slurm_config = config['slurm'] slurm_config.setdefault('extra_args', '') + slurm_config.setdefault('set_segment', True) hw_config = config['hardware'] env_config = config['environment'] @@ -160,7 +161,7 @@ def submit_job(config, log_dir): f'--nodes={total_nodes}', f'--ntasks={total_tasks}', f'--ntasks-per-node={hw_config["gpus_per_node"]}', - f'--segment={total_nodes}', + *([] if not slurm_config['set_segment'] else [f'--segment={total_nodes}']), *([arg for arg in slurm_config['extra_args'].split() if arg]), slurm_config['script_file'], # Hardware configuration diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index c050fdd468..cbc5b3823b 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ 
b/tests/integration/defs/perf/disagg/utils/common.py @@ -60,6 +60,12 @@ class EnvManager: def get_slurm_job_name() -> str: return os.getenv("SLURM_JOB_NAME", "unified-benchmark") + @staticmethod + def get_slurm_set_segment() -> bool: + gpu_type = EnvManager.get_gpu_type() + gpu_type_support_segment = {"GB200": True, "GB300": False} + return gpu_type_support_segment.get(gpu_type, False) + @staticmethod def get_container_image() -> str: return os.getenv("CONTAINER_IMAGE", "") diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py index 7ee64d410d..c8dd5e21a6 100644 --- a/tests/integration/defs/perf/disagg/utils/config_loader.py +++ b/tests/integration/defs/perf/disagg/utils/config_loader.py @@ -477,6 +477,7 @@ class ConfigLoader: ("environment", "work_dir"): lambda: EnvManager.get_script_dir(), ("environment", "model_path"): lambda: self._get_full_model_path(config), ("slurm", "script_file"): lambda: self._get_script_file(config), + ("slurm", "set_segment"): lambda: EnvManager.get_slurm_set_segment(), } # Apply overrides based on field paths From c1d53ee43da661b25109a1c94f4e1b282eee684b Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Thu, 11 Dec 2025 11:18:30 +0800 Subject: [PATCH 065/172] [https://nvbugs/5582258][fix] unwaive (#9650) Signed-off-by: Bo Deng --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 54cba2e79c..b4261692f8 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -312,7 +312,6 @@ accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[F accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5569696) triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) 
triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) -accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5582258) accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233) examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False] SKIP (https://nvbugs/5629791) From 81222c36705e463cd3b0b138bdab272108536186 Mon Sep 17 00:00:00 2001 From: Ziyi Xiong <219238287+ziyixiong-nv@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:22:38 +0800 Subject: [PATCH 066/172] [None] Fix warning when capturing CUDA graph (#9746) Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com> --- tensorrt_llm/_torch/speculative/drafting_loops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorrt_llm/_torch/speculative/drafting_loops.py b/tensorrt_llm/_torch/speculative/drafting_loops.py index f044fdd106..159cd9d528 100644 --- a/tensorrt_llm/_torch/speculative/drafting_loops.py +++ b/tensorrt_llm/_torch/speculative/drafting_loops.py @@ -19,6 +19,9 @@ from tensorrt_llm._torch.speculative.eagle3 import Eagle3SpecMetadata from tensorrt_llm._torch.speculative.interface import SpecMetadata from tensorrt_llm._torch.speculative.spec_tree_manager import SpecTreeManager +# Enable capture_scalar_outputs to avoid graph breaks from Tensor.item() calls +torch._dynamo.config.capture_scalar_outputs = True + class BaseDraftingLoopWrapper(ABC, torch.nn.Module): From 454e7e59e550f5d0b296f50eddfaa98686e41e7c Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Thu, 11 Dec 2025 12:20:33 +0800 
Subject: [PATCH 067/172] [https://nvbugs/5718004][fix] Add warmup for cancellation test (#9860) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tests/unittest/llmapi/apps/_test_openai_misc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unittest/llmapi/apps/_test_openai_misc.py b/tests/unittest/llmapi/apps/_test_openai_misc.py index 8cc715389f..9e1b1a8dbe 100644 --- a/tests/unittest/llmapi/apps/_test_openai_misc.py +++ b/tests/unittest/llmapi/apps/_test_openai_misc.py @@ -89,6 +89,13 @@ async def test_request_cancellation(server: RemoteOpenAIServer, # clunky test: send an ungodly amount of load in with short timeouts # then ensure that it still responds quickly afterwards chat_input = [{"role": "user", "content": "Write a long story"}] + + # Warmup + client = server.get_async_client() + response = await client.chat.completions.create(messages=chat_input, + model=model_name, + max_tokens=10000) + client = server.get_async_client(timeout=0.5, max_retries=3) tasks = [] # Request about 2 million tokens From d147ad053e2824d311069181590cb9933e095a49 Mon Sep 17 00:00:00 2001 From: Kanghwan <861393+karljang@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:51:08 -0800 Subject: [PATCH 068/172] [#2730][fix] Fix circular import bug in medusa/weight.py (#9866) Signed-off-by: Kanghwan Jang <861393+karljang@users.noreply.github.com> --- tensorrt_llm/models/medusa/weight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/models/medusa/weight.py b/tensorrt_llm/models/medusa/weight.py index 049d1d1b3a..6964dbdd3e 100644 --- a/tensorrt_llm/models/medusa/weight.py +++ b/tensorrt_llm/models/medusa/weight.py @@ -11,8 +11,8 @@ from tqdm import tqdm from transformers.models.llama.modeling_llama import LlamaDecoderLayer from transformers.pytorch_utils import Conv1D -from tensorrt_llm import logger from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger from 
tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import (dup_kv_weight, generate_int8, smooth_gemm, @@ -51,7 +51,7 @@ def load_medusa_hf(medusa_path: str, use_weight_only=False, plugin_weight_only_quant_type=None, is_modelopt_ckpt=False): - # logger.info("Loading Medusa heads' weights ...") + logger.info("Loading Medusa heads' weights ...") if is_modelopt_ckpt: from safetensors.torch import load_file From b8a5159fad493acc7ba57ceca69ed18b6c4e99f1 Mon Sep 17 00:00:00 2001 From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:31:23 +0800 Subject: [PATCH 069/172] [None][feat] Enable PDL for indexer topK (#9843) Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/indexerTopK.cu | 77 +++++++++++++++++++++---- 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/indexerTopK.cu b/cpp/tensorrt_llm/kernels/indexerTopK.cu index 361748a380..40e377c998 100644 --- a/cpp/tensorrt_llm/kernels/indexerTopK.cu +++ b/cpp/tensorrt_llm/kernels/indexerTopK.cu @@ -589,6 +589,9 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( int const* rowStarts, int const* rowEnds, int* outIndices, int stride0, int stride1, int const topK, int const offsetIndex) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif // The number of bins in the histogram. 
static constexpr int kNumBins = 2048; @@ -605,6 +608,9 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( topKPerRowJob( nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -612,6 +618,9 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(f int* outIndices, int stride0, int stride1, int const topK, int next_n, float* outLogits = nullptr, int const numBlocksToMerge = 0, int const* indices = nullptr) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif // The number of bins in the histogram. static constexpr int kNumBins = 2048; @@ -646,6 +655,9 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(f topKPerRowJob( indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux, @@ -660,28 +672,73 @@ void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indic if (numColumns < kSortingAlgorithmThreshold) { // Use insertion sort - topKPerRowDecode<<>>( - logits, seqLens, indices, stride0, stride1, topK, next_n); + auto* kernel_instance = &topKPerRowDecode; + + cudaLaunchConfig_t config; + config.gridDim = numRows; + config.blockDim = kNumThreadsPerBlock; + config.dynamicSmemBytes = topK * sizeof(int32_t); + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + + cudaLaunchKernelEx( + &config, kernel_instance, logits, seqLens, indices, 
stride0, stride1, topK, next_n, nullptr, 0, nullptr); } else if (numColumns < kSplitWorkThreshold) { // From this threshold, use radix sort instead - topKPerRowDecode<<>>( - logits, seqLens, indices, stride0, stride1, topK, next_n); + auto* kernel_instance = &topKPerRowDecode; + + cudaLaunchConfig_t config; + config.gridDim = numRows; + config.blockDim = kNumThreadsPerBlock; + config.dynamicSmemBytes = topK * sizeof(int32_t); + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + + cudaLaunchKernelEx( + &config, kernel_instance, logits, seqLens, indices, stride0, stride1, topK, next_n, nullptr, 0, nullptr); } else { // Long sequences are run in two steps constexpr auto multipleBlocksPerRowConfig = 10; + auto* kernel_instance_part1 = &topKPerRowDecode; + cudaLaunchConfig_t config_part1; + config_part1.gridDim = dim3(numRows, multipleBlocksPerRowConfig); + config_part1.blockDim = kNumThreadsPerBlock; + config_part1.dynamicSmemBytes = 2 * topK * sizeof(int32_t); + config_part1.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + config_part1.numAttrs = 1; + config_part1.attrs = attrs; - topKPerRowDecode - <<>>( - logits, seqLens, outIndicesAux, stride0, stride1, topK, next_n, outLogitsAux); + cudaLaunchKernelEx(&config_part1, kernel_instance_part1, logits, seqLens, outIndicesAux, stride0, stride1, topK, + next_n, outLogitsAux, 0, nullptr); constexpr int kNumThreadsPerBlockMerge = 1024; - topKPerRowDecode - <<>>(outLogitsAux, seqLens, indices, - multipleBlocksPerRowConfig * topK, 1, topK, next_n, nullptr, multipleBlocksPerRowConfig, outIndicesAux); + auto* kernel_instance_part2 = 
&topKPerRowDecode; + cudaLaunchConfig_t config_part2; + config_part2.gridDim = numRows; + config_part2.blockDim = kNumThreadsPerBlockMerge; + config_part2.dynamicSmemBytes = topK * sizeof(int32_t); + config_part2.stream = stream; + // Reuse attrs array since part1 kernel has already been launched + config_part2.numAttrs = 1; + config_part2.attrs = attrs; + + cudaLaunchKernelEx(&config_part2, kernel_instance_part2, outLogitsAux, seqLens, indices, + multipleBlocksPerRowConfig * topK, 1, topK, next_n, nullptr, multipleBlocksPerRowConfig, outIndicesAux); } sync_check_cuda_error(stream); } From c76b428e2e27c6ce0d57e40ac550961733a7298d Mon Sep 17 00:00:00 2001 From: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:21:32 +0800 Subject: [PATCH 070/172] [TRTLLM-9685] [feat] Add gather fc1 kernel by cuteDSL (#9618) Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- .../_torch/custom_ops/cute_dsl_custom_ops.py | 516 ++- ...guous_gather_grouped_gemm_swiglu_fusion.py | 3029 +++++++++++++++++ .../blackwell/custom_pipeline.py | 154 +- .../modules/fused_moe/fused_moe_cute_dsl.py | 14 +- tensorrt_llm/_torch/utils.py | 9 + .../_torch/thop/parallel/test_cute_dsl_moe.py | 207 +- 7 files changed, 3915 insertions(+), 16 deletions(-) create mode 100644 tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 311db068b5..c15d2ac081 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1439,7 +1439,7 @@ repos: additional_dependencies: - tomli # add ignore words list - args: ["-L", "Mor,ans,thirdparty", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"] + args: ["-L", "Mor,ans,thirdparty,subtiles", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.4 hooks: diff 
--git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index 897757cf2c..d497ace49b 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -21,6 +21,14 @@ except ImportError: class GroupedGemmInputsHelper: + """Base helper class for grouped GEMM input preparation and tuning. + + Subclasses should override IDX_SHAPE_INFER to specify which input tensor + to use for shape inference in tuning. + """ + # Input tensor index for shape inference - subclass can override + IDX_A = 0 + IDX_SHAPE_INFER = IDX_A # Default: use a tensor for shape inference def __init__(self, num_experts: int, top_k: int, num_local_experts: int, local_expert_offset: int, tile_size: int): @@ -63,10 +71,11 @@ class GroupedGemmInputsHelper: last_positive_power_of_2(self.infer_num_tokens(x))) def infer_shape_num_tokens(self, input_shapes: List[torch.Size]) -> int: - return self.infer_num_tokens(input_shapes[0][0]) + return self.infer_num_tokens(input_shapes[self.IDX_SHAPE_INFER][0]) def infer_shape_max_num_tiles(self, input_shapes: List[torch.Size]) -> int: - return input_shapes[0][0] // self.tile_size + """Infer max_num_tiles from the shape inference tensor (IDX_SHAPE_INFER).""" + return input_shapes[self.IDX_SHAPE_INFER][0] // self.tile_size def infer_shape_max_num_permuted_tokens( self, input_shapes: List[torch.Size]) -> int: @@ -187,6 +196,123 @@ class GroupedGemmInputsHelper: return a, b, a_sf, b_sf, alpha, output, tile_idx_to_group_idx, tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, num_non_exiting_tiles, token_final_scales +class GatherGroupedGemmInputsHelper(GroupedGemmInputsHelper): + """Helper class for gather-based grouped GEMM input preparation. 
+ + This subclass handles inputs where: + - a tensor contains original (non-permuted) activations + - permuted_idx_to_expanded_idx specifies the gather pattern + - Shape inference uses permuted_idx_to_expanded_idx size instead of a size + + Input tensor layout: + 0: a - Original input activation (not permuted) + 1: b - Weight tensor + 2: a_sf - Scale factor for a + 3: b_sf - Scale factor for b + 4: alpha - Per-expert scaling factor + 5: tile_idx_to_group_idx - Tile to expert mapping + 6: tile_idx_to_mn_limit - Tile M/N limits + 7: permuted_idx_to_expanded_idx - Token permutation mapping + 8: num_non_exiting_tiles - Number of valid tiles + 9: global_sf - Global scale factor + """ + # Override: use permuted_idx_to_expanded_idx for shape inference + IDX_PERMUTED_IDX_TO_EXPANDED_IDX = 7 + IDX_SHAPE_INFER = IDX_PERMUTED_IDX_TO_EXPANDED_IDX + + def generate_permuted_idx_to_expanded_idx( + self, num_tokens: int, num_tokens_per_expert: List[int], + max_num_permuted_tokens: int) -> List[int]: + """Generate permuted_idx_to_expanded_idx for gather operation. + + Maps permuted index to expanded index (token_idx * top_k + topk_idx). 
+ + Args: + num_tokens: Total number of input tokens + num_tokens_per_expert: List of token counts per expert + max_num_permuted_tokens: Target size of the output list + + Returns: + List of expanded IDs with length = max_num_permuted_tokens, + where permuted_idx_to_expanded_idx[permuted_idx] = expanded_idx + Padding tokens are marked with pad_val + Note: In kernel, use expanded_idx // top_k to get original token_idx + """ + permuted_idx_to_expanded_idx = [] + colmajor_expanded_idx = 0 + for i, curr_num_tokens in enumerate(num_tokens_per_expert): + curr_num_tiles = (curr_num_tokens + self.tile_size - + 1) // self.tile_size + for j in range(curr_num_tiles * self.tile_size): + if j < curr_num_tokens: + token_idx = colmajor_expanded_idx % num_tokens + topk_idx = colmajor_expanded_idx // num_tokens + expanded_idx = token_idx * self.top_k + topk_idx + permuted_idx_to_expanded_idx.append(expanded_idx) + colmajor_expanded_idx += 1 + else: + permuted_idx_to_expanded_idx.append( + self.pad_val) # Padding token + # Pad to max_num_permuted_tokens + while len(permuted_idx_to_expanded_idx) < max_num_permuted_tokens: + permuted_idx_to_expanded_idx.append(self.pad_val) + return permuted_idx_to_expanded_idx + + def inputs_pre_hook(self, inputs: List[torch.Tensor]) -> List[torch.Tensor]: + """Pre-hook for gather-based SwiGLU fusion kernel. + + Generates: + - tile_idx_to_group_idx + - tile_idx_to_mn_limit + - permuted_idx_to_expanded_idx (for gather operation) + - num_non_exiting_tiles + """ + a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, tile_idx_to_mn_limit, \ + permuted_idx_to_expanded_idx, num_non_exiting_tiles, global_sf = inputs + # Verify permuted_idx_to_expanded_idx index matches the class constant + assert inputs[ + self. 
+ IDX_PERMUTED_IDX_TO_EXPANDED_IDX] is permuted_idx_to_expanded_idx + + max_num_permuted_tokens = permuted_idx_to_expanded_idx.size(0) + max_num_tiles = max_num_permuted_tokens // self.tile_size + + num_tokens = self.infer_num_tokens(max_num_permuted_tokens) + num_tokens_per_expert = self.generate_num_tokens_per_expert(num_tokens) + tile_idx_to_group_idx_list = self.generate_tile_idx_to_group_idx( + num_tokens_per_expert) + tile_idx_to_mn_limit_list = self.generate_tile_idx_to_mn_limit( + num_tokens_per_expert) + permuted_idx_to_expanded_idx_list = self.generate_permuted_idx_to_expanded_idx( + num_tokens, num_tokens_per_expert, max_num_permuted_tokens) + num_non_exiting_tiles_val = len(tile_idx_to_group_idx_list) + num_padding_tiles_val = max_num_tiles - num_non_exiting_tiles_val + assert num_non_exiting_tiles_val > 0 + assert num_padding_tiles_val >= 0 + assert len(tile_idx_to_mn_limit_list) == num_non_exiting_tiles_val + assert len(permuted_idx_to_expanded_idx_list) == max_num_permuted_tokens + + tile_idx_to_group_idx = torch.tensor( + tile_idx_to_group_idx_list + [self.pad_val] * num_padding_tiles_val, + dtype=tile_idx_to_group_idx.dtype, + device=tile_idx_to_group_idx.device) + tile_idx_to_mn_limit = torch.tensor( + tile_idx_to_mn_limit_list + [self.pad_val] * num_padding_tiles_val, + dtype=tile_idx_to_mn_limit.dtype, + device=tile_idx_to_mn_limit.device) + permuted_idx_to_expanded_idx = torch.tensor( + permuted_idx_to_expanded_idx_list, + dtype=permuted_idx_to_expanded_idx.dtype, + device=permuted_idx_to_expanded_idx.device) + num_non_exiting_tiles = torch.tensor( + [num_non_exiting_tiles_val], + dtype=num_non_exiting_tiles.dtype, + device=num_non_exiting_tiles.device) + return (a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, + tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, + num_non_exiting_tiles, global_sf) + + class FusedMoEInputsHelper: def __init__(self, num_experts: int, top_k: int, num_local_experts: int, @@ -217,6 +343,8 @@ if 
IS_CUTLASS_DSL_AVAILABLE: import cutlass import cutlass.cute as cute + from ..cute_dsl_kernels.blackwell.blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion import \ + BlockScaledContiguousGatherGroupedGemmKernel from ..cute_dsl_kernels.blackwell.blockscaled_contiguous_grouped_gemm import \ Sm100BlockScaledContiguousGroupedGemmKernel from ..cute_dsl_kernels.blackwell.blockscaled_contiguous_grouped_gemm_finalize_fusion import \ @@ -1683,6 +1811,390 @@ if IS_CUTLASS_DSL_AVAILABLE: device=input_scale.device) return output, output_scale + class Sm100BlockScaledContiguousGatherGroupedGemmSwigluFusionRunner( + TunableRunner): + kernel_class = BlockScaledContiguousGatherGroupedGemmKernel + kernel_cache = dict() + tuning_config_cache = dict() + + def __init__(self, + num_experts: int, + top_k: int, + num_local_experts: int, + local_expert_offset: int, + tile_size: int, + scaling_vector_size: int = 16): + super().__init__() + self.num_experts = num_experts + self.top_k = top_k + self.num_local_experts = num_local_experts + self.local_expert_offset = local_expert_offset + if tile_size not in [128, 256]: + raise ValueError( + f"Tile size {tile_size} is not supported, it only supports 128 and 256." 
+ ) + self.tile_size = tile_size + self.scaling_vector_size = scaling_vector_size + + if get_sm_version() != 100 and get_sm_version() != 103: + raise ValueError( + f"SM version {get_sm_version()} is not supported for {self.__class__.__name__}, it only supports SM 100 and SM 103" + ) + + def unique_id(self): + return ( + self.num_experts, + self.top_k, + self.num_local_experts, + self.local_expert_offset, + self.tile_size, + self.scaling_vector_size, + ) + + def get_valid_tactics( + self, + inputs: List[torch.Tensor], + profile: OptimizationProfile, + **kwargs, + ) -> List[Tuple[int, int]]: + a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, *_ = inputs + # m is the permuted size from permuted_idx_to_expanded_idx, not from a + m = permuted_idx_to_expanded_idx.size(0) + k = a.size(1) * 2 + l, n = b.size(0), b.size(1) + + if self.tile_size == 128: + mma_tiler_mn_candidates = [(128, 128), (128, 256)] + cluster_shape_mn_candidates = [(1, 1)] + elif self.tile_size == 256: + mma_tiler_mn_candidates = [(256, 128), (256, 256)] + cluster_shape_mn_candidates = [(2, 1)] + else: + raise ValueError(f"Tile size {self.tile_size} is not supported") + + valid_tactics = [] + for mma_tiler_mn, cluster_shape_mn in itertools.product( + mma_tiler_mn_candidates, cluster_shape_mn_candidates): + if self.__class__.kernel_class.can_implement( + ab_dtype=cutlass.Float4E2M1FN, + sf_dtype=cutlass.Float8E4M3FN, + sf_vec_size=self.scaling_vector_size, + acc_dtype=cutlass.Float32, + c_dtype=cutlass.Float4E2M1FN, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + m=m, + n=n, + k=k, + l=l, + a_major="k", + b_major="k", + c_major="n", + m_aligned=self.tile_size, + ): + valid_tactics.append((mma_tiler_mn, cluster_shape_mn)) + + return valid_tactics + + def get_tuning_config(self) -> TuningConfig: + key = self.unique_id() + if key not in self.__class__.tuning_config_cache: + helper = GatherGroupedGemmInputsHelper(self.num_experts, + 
self.top_k, + self.num_local_experts, + self.local_expert_offset, + self.tile_size) + self.__class__.tuning_config_cache[key] = TuningConfig( + # Use permuted_idx_to_expanded_idx (IDX_SHAPE_INFER) for tuning + dynamic_tensor_specs=(DynamicTensorSpec( + GatherGroupedGemmInputsHelper.IDX_SHAPE_INFER, 0, + helper.gen_tuning_buckets, + helper.map_to_tuning_buckets), ), + constraint_specs=(ConstraintSpec( + 0, 0, helper.infer_shape_num_tokens), + ConstraintSpec( + 2, 0, helper.infer_shape_num_tokens), + ConstraintSpec( + 5, 0, + helper.infer_shape_max_num_tiles), + ConstraintSpec( + 6, 0, + helper.infer_shape_max_num_tiles)), + inputs_pre_hook=helper.inputs_pre_hook, + use_cuda_graph=True, + ) + return self.__class__.tuning_config_cache[key] + + def forward(self, inputs: List[torch.Tensor], + tactic: Optional[tuple]) -> torch.Tensor: + a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, num_non_exiting_tiles, global_sf = inputs + # Verify permuted_idx_to_expanded_idx index matches the class constant + assert inputs[ + GatherGroupedGemmInputsHelper. 
+ IDX_PERMUTED_IDX_TO_EXPANDED_IDX] is permuted_idx_to_expanded_idx + assert a.dtype == torch.float4_e2m1fn_x2 + assert a.dim() == 2 + assert b.dtype == torch.float4_e2m1fn_x2 + assert b.dim() == 3 + assert a_sf.dtype == torch.uint8 + assert a_sf.dim() == 2 + assert b_sf.dtype == torch.uint8 + assert b_sf.dim() == 3 + assert alpha.dtype == torch.float32 + assert alpha.dim() == 1 + + # a.size(0) is orig_m (original input size before gather) + # permuted_idx_to_expanded_idx.size(0) is m (permuted size after gather) + orig_m, k = a.size(0), a.size(1) * 2 + m = permuted_idx_to_expanded_idx.size(0) + l, n = b.size(0), b.size(1) + scale_k = k // self.scaling_vector_size + interm_size = n // 2 + assert m % self.tile_size == 0 + assert k % (self.scaling_vector_size * 4) == 0 + assert n % (self.scaling_vector_size * 4 * 2) == 0 + assert b.size(2) * 2 == k + assert a_sf.size(0) == orig_m + assert a_sf.size(1) == scale_k + assert b_sf.size(0) == l + assert b_sf.size(1) == n + assert b_sf.size(2) == scale_k + assert alpha.size(0) == l + + num_tiles = m // self.tile_size + assert tile_idx_to_group_idx.dtype == torch.int32 + assert tile_idx_to_group_idx.size() == (num_tiles, ) + assert tile_idx_to_mn_limit.dtype == torch.int32 + assert tile_idx_to_mn_limit.size() == (num_tiles, ) + assert permuted_idx_to_expanded_idx.dtype == torch.int32 + assert permuted_idx_to_expanded_idx.size() == (m, ) + assert num_non_exiting_tiles.dtype == torch.int32 + assert num_non_exiting_tiles.numel() == 1 + assert global_sf.dtype == torch.float32 + assert global_sf.numel() == 1 + + c = torch.empty(m, interm_size // 2, dtype=a.dtype, device=a.device) + c_sf = torch.empty(m * interm_size // self.scaling_vector_size, + dtype=a_sf.dtype, + device=a_sf.device) + + a_ptr = make_ptr(cutlass.Float4E2M1FN, + a.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=32) + b_ptr = make_ptr(cutlass.Float4E2M1FN, + b.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=32) + a_sf_ptr = 
make_ptr(cutlass.Float8E4M3FN, + a_sf.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=16) + b_sf_ptr = make_ptr(cutlass.Float8E4M3FN, + b_sf.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=16) + alpha_ptr = make_ptr(cutlass.Float32, alpha.data_ptr(), + cute.AddressSpace.gmem) + tile_idx_to_group_idx_ptr = make_ptr( + cutlass.Int32, tile_idx_to_group_idx.data_ptr(), + cute.AddressSpace.gmem) + tile_idx_to_mn_limit_ptr = make_ptr(cutlass.Int32, + tile_idx_to_mn_limit.data_ptr(), + cute.AddressSpace.gmem) + permuted_idx_to_expanded_idx_ptr = make_ptr( + cutlass.Int32, permuted_idx_to_expanded_idx.data_ptr(), + cute.AddressSpace.gmem) + num_non_exiting_tiles_ptr = make_ptr( + cutlass.Int32, num_non_exiting_tiles.data_ptr(), + cute.AddressSpace.gmem) + global_sf_ptr = make_ptr(cutlass.Float32, global_sf.data_ptr(), + cute.AddressSpace.gmem) + c_ptr = make_ptr(cutlass.Float4E2M1FN, + c.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=32) + c_sf_ptr = make_ptr(cutlass.Float8E4M3FN, + c_sf.data_ptr(), + cute.AddressSpace.gmem, + assumed_align=16) + + torch_stream = torch.cuda.current_stream() + stream = cuda.CUstream(torch_stream.cuda_stream) + + if isinstance(tactic, tuple): + mma_tiler_mn, cluster_shape_mn = tactic + else: + mma_tiler_mn, cluster_shape_mn = (self.tile_size, + 128), (self.tile_size // 128, + 1) + + cache_key = (self.scaling_vector_size, self.tile_size, self.top_k, + mma_tiler_mn, cluster_shape_mn) + if cache_key not in self.__class__.kernel_cache: + gemm = self.__class__.kernel_class( + sf_vec_size=self.scaling_vector_size, + acc_dtype=cutlass.Float32, + mma_tiler_mn=mma_tiler_mn, + cluster_shape_mn=cluster_shape_mn, + vectorized_f32=True, + topk=self.top_k, + ) + # Compute max active clusters on current device + hardware_info = cutlass.utils.HardwareInfo() + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1]) + + compiled_gemm = cute.compile( + gemm.wrapper, + a_ptr, + b_ptr, + 
a_sf_ptr, + b_sf_ptr, + c_ptr, + c_sf_ptr, + alpha_ptr, + tile_idx_to_group_idx_ptr, + tile_idx_to_mn_limit_ptr, + permuted_idx_to_expanded_idx_ptr, + num_non_exiting_tiles_ptr, + global_sf_ptr, + orig_m, + m, + n, + k, + l, + tile_size=self.tile_size, + scaling_vector_size=self.scaling_vector_size, + max_active_clusters=max_active_clusters, + stream=stream, + ) + self.__class__.kernel_cache[cache_key] = compiled_gemm + else: + compiled_gemm = self.__class__.kernel_cache[cache_key] + + compiled_gemm( + a_ptr, + b_ptr, + a_sf_ptr, + b_sf_ptr, + c_ptr, + c_sf_ptr, + alpha_ptr, + tile_idx_to_group_idx_ptr, + tile_idx_to_mn_limit_ptr, + permuted_idx_to_expanded_idx_ptr, + num_non_exiting_tiles_ptr, + global_sf_ptr, + orig_m, + m, + n, + k, + l, + stream=stream, + ) + return c, c_sf + + @torch.library.custom_op( + "trtllm::cute_dsl_nvfp4_gather_grouped_gemm_swiglu_blackwell", + mutates_args=(), + device_types="cuda") + def cute_dsl_nvfp4_gather_grouped_gemm_swiglu_blackwell( + input: torch.Tensor, + weight: torch.Tensor, + input_scale: torch.Tensor, + weight_scale: torch.Tensor, + alpha: torch.Tensor, + tile_idx_to_group_idx: torch.Tensor, + tile_idx_to_mn_limit: torch.Tensor, + permuted_idx_to_expanded_idx: torch.Tensor, + num_non_exiting_tiles: torch.Tensor, + global_sf: torch.Tensor, + num_experts: int, + top_k: int, + num_local_experts: int, + local_expert_offset: int, + tile_size: int, + scaling_vector_size: int = 16, + ) -> Tuple[torch.Tensor, torch.Tensor]: + tuner = AutoTuner.get() + + runner = Sm100BlockScaledContiguousGatherGroupedGemmSwigluFusionRunner( + num_experts, top_k, num_local_experts, local_expert_offset, + tile_size, scaling_vector_size) + inputs = [ + input, weight, input_scale, weight_scale, alpha, + tile_idx_to_group_idx, tile_idx_to_mn_limit, + permuted_idx_to_expanded_idx, num_non_exiting_tiles, global_sf + ] + + _, best_tactic = tuner.choose_one( + "trtllm::cute_dsl_nvfp4_gather_grouped_gemm_swiglu_blackwell", + [runner], + 
runner.get_tuning_config(), + inputs, + ) + output = runner(inputs, tactic=best_tactic) + return output + + @torch.library.register_fake( + "trtllm::cute_dsl_nvfp4_gather_grouped_gemm_swiglu_blackwell") + def _( + input: torch.Tensor, + weight: torch.Tensor, + input_scale: torch.Tensor, + weight_scale: torch.Tensor, + alpha: torch.Tensor, + tile_idx_to_group_idx: torch.Tensor, + tile_idx_to_mn_limit: torch.Tensor, + permuted_idx_to_expanded_idx: torch.Tensor, + num_non_exiting_tiles: torch.Tensor, + global_sf: torch.Tensor, + num_experts: int, + top_k: int, + num_local_experts: int, + local_expert_offset: int, + tile_size: int, + scaling_vector_size: int = 16, + ) -> Tuple[torch.Tensor, torch.Tensor]: + m = permuted_idx_to_expanded_idx.size(0) + n = weight.size(1) + interm_size = n // 2 + output = torch.empty(m, + interm_size // 2, + dtype=input.dtype, + device=input.device) + output_scale = torch.empty(m * interm_size // scaling_vector_size, + dtype=input_scale.dtype, + device=input_scale.device) + return output, output_scale + + class FusedMoEInputsHelper: + + def __init__(self, num_experts: int, top_k: int, num_local_experts: int, + local_expert_offset: int): + self.num_experts = num_experts + self.top_k = top_k + self.num_local_experts = num_local_experts + self.local_expert_offset = local_expert_offset + + def infer_shape_num_tokens(self, input_shapes: List[torch.Size]) -> int: + return input_shapes[0][0] + + def inputs_pre_hook(self, + inputs: List[torch.Tensor]) -> List[torch.Tensor]: + x, x_sf, token_selected_experts, token_final_scales, *others = inputs + num_tokens = token_selected_experts.size(0) + new_token_final_scales, new_token_selected_experts = torch.randn( + num_tokens, + self.num_experts, + device=token_selected_experts.device).topk(self.top_k, dim=-1) + new_token_selected_experts = new_token_selected_experts.to( + token_selected_experts.dtype) + new_token_final_scales = new_token_final_scales.softmax(dim=-1).to( + token_final_scales.dtype) + 
return x, x_sf, new_token_selected_experts, new_token_final_scales, *others + class Sm100BlockScaledFusedMoERunner(TunableRunner): tuning_config_cache = dict() diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py new file mode 100644 index 0000000000..5b9d06bb17 --- /dev/null +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py @@ -0,0 +1,3029 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from typing import Optional, Tuple, Type, Union + +import cuda.bindings.driver as cuda +import cutlass +import cutlass.cute as cute +import cutlass.pipeline as pipeline +import cutlass.utils as utils +import cutlass.utils.blackwell_helpers as sm100_utils +import cutlass.utils.blockscaled_layout as blockscaled_utils +from cutlass._mlir.dialects import math, nvvm +from cutlass.cute.nvgpu import cpasync, tcgen05 +from cutlass.cute.typing import Float32 +from cutlass.cutlass_dsl import T, dsl_user_op + +from .custom_pipeline import PipelineCpAsyncUmma +from .utils import is_power_of_2 + + +@dsl_user_op +def fmin( + a: Union[float, Float32], b: Union[float, Float32], *, nan=False, loc=None, ip=None +) -> Float32: + return Float32( + nvvm.fmin( + T.f32(), + Float32(a).ir_value(loc=loc, ip=ip), + Float32(b).ir_value(loc=loc, ip=ip), + nan=nan, + loc=loc, + ip=ip, + ) + ) + + +def sigmoid_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]: + """ + Compute the sigmoid of the input tensor. + """ + return cute.arch.rcp_approx(1.0 + cute.math.exp(-a, fastmath=fastmath)) + + +def silu_f32(a: Union[float, Float32], fastmath: bool = False) -> Union[float, Float32]: + """ + Compute the silu of the input tensor. 
+ """ + return a * sigmoid_f32(a, fastmath=fastmath) + + +""" +High-performance persistent blockscaled contiguous grouped dense GEMM with gather and SwiGLU fusion +(C = up * silu(gate), where up and gate come from interleaved weight matrix B) +example for the NVIDIA Blackwell architecture using CUTE DSL. + +This kernel performs FC1 layer computation with SwiGLU activation fusion: +1. GEMM: acc = alpha * (SFA * A[token_ids]) * (SFB * B) +2. SwiGLU: C = up * silu(gate), where up/gate are extracted from interleaved acc (granularity=64) +3. Optional Quant: When c_dtype is Float4E2M1FN, generates scale factor C and quantizes output + +- Matrix A is MxKx1, A can be row-major("K"), ValidM is composed of valid m in different groups +- Matrix B is NxKxL, B can be column-major("K"), L is grouped dimension (number of experts) + - B weights are interleaved: [up_0:64, gate_64:128, up_128:192, gate_192:256, ...] +- Matrix C is Mx(N/2)x1, C can be row-major("N"), N is halved due to SwiGLU fusion +- Matrix SFA layout is filled internally according to A shape and BlockScaledBasicChunk, + which has M×ceil_div(K, sf_vec_size)×1 elements +- Matrix SFB layout is filled internally according to B shape and BlockScaledBasicChunk, + which has N×ceil_div(K, sf_vec_size)×L elements +- Token ID mapping tensor enables gather operation for A and SFA + +Matrix A/C Memory Layout Diagrams: + + ``` + Group 0 Group 1 Group 2 + -+---------+---------+---------+ + | | | | + K| ValidM0 | ValidM1 | ValidM2 | + | | | | + -+---------+---------+---------+ + |<- ValidM ->| + ``` + Note: the Group(L) dimension will be flatted into M dimension, and the rest Group(L) size is 1. + each ValidM will be aligned to 256 or 128. The alignment is determined by the mma_tiler_mn parameter. + For NVFP4, 2CTA, the alignment is 256. For NVFP4, 1CTA, the alignment is 128. 
+ +This GEMM kernel supports the following features: + - Utilizes LDGSTS (asynchronous global-to-shared memory copy, i.e. cp.async) for A and SFA with gather operation + - Utilizes the Tensor Memory Accelerator (TMA) for B and SFB matrices + - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations + - Implements TMA multicast with cluster to reduce L2 memory traffic + - Supports persistent tile scheduling to better overlap memory load/store with mma between tiles + - Supports warp specialization to avoid explicit pipelining between mainloop load and mma + +This GEMM works as follows: +1. SCHEDULER warp (warp 10): Dispatches tile information to all consumer warps via tile_info_pipeline. +2. LDGSTS A/SFA warps (warps 4-7): + - Load A matrix from global memory (GMEM) to shared memory (SMEM) using LDGSTS instructions with gather. + - Load SFA (scale factor A) from GMEM to SMEM using LDGSTS instructions. + - Uses token_id_mapping to perform permutation/gather during load. +3. TMA B/SFB warp (warp 9): + - Load B and SFB matrices from GMEM to SMEM using TMA operations with multicast. +4. MMA warp (warp 8): + - Load scale factor A/B from shared memory (SMEM) to tensor memory (TMEM) using tcgen05.cp instruction. + - Perform matrix multiply-accumulate (MMA) operations using tcgen05.mma instruction. +5. EPILOGUE warps (warps 0-3): + - Load two accumulator subtiles (up and gate) from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld. + - Apply alpha scaling: up_scaled = alpha * up, gate_scaled = alpha * gate + - Compute SwiGLU activation: output = up_scaled * silu(gate_scaled), where silu(x) = x * sigmoid(x) + - If c_dtype is Float4E2M1FN: generate scale factor C (SFC) and quantize output + - Type convert output to c_dtype. + - Store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations.
+ +SM100 tcgen05.mma.kind.block_scale instructions operate as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Read scalefactor A from TMEM +- Read scalefactor B from TMEM +- Write accumulator to TMEM +The accumulator in TMEM must then be loaded to registers before writing back to GMEM. + +Constraints: +* Supported input data types: mxf8, mxf4, nvf4 + see detailed valid dtype combinations in the BlockScaledContiguousGatherGroupedGemmKernel class documentation below +* A/B tensor must have the same data type, mixed data type is not supported (e.g., mxf8 x mxf4) +* Mma tiler M must be 128 or 256 (use_2cta_instrs) +* Mma tiler N must be 64/128/192/256 +* Cluster shape M/N must be positive and power of 2, total cluster size <= 16 +* Cluster shape M must be multiple of 2 if Mma tiler M is 256 (use_2cta_instrs) +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e., the number of elements is a multiple of 16 and 32 for Float8 and Float4, respectively. + +CUDA Graph Support: +* For CUDA graph support, the tile_idx_to_expert_idx, token_id_mapping, A/C matrices, + and scale factor A can be padded to a larger size + (e.g., permuted_m = m*topK + num_local_experts*(256-1), + example: 4096*8 + (256/32)*255 = 34808) +* Use create_tensors() with permuted_m parameter to automatically pad: + - tile_idx_to_expert_idx: padded for invalid tiles (set to -2e9 for padding tiles) + - token_id_mapping: padded to permuted_m size (invalid tokens set to -1) + - A matrix: padded to permuted_m rows (padding rows contain dummy data) + - C matrix: padded to permuted_m rows (output buffer for cuda_graph) + - Scale factor A: padded to match A matrix dimensions +* Kernel handling of padding: + - Scheduler warp checks if tile_idx >= num_non_exiting_tiles to exit + - Only valid tiles (tile_idx < num_non_exiting_tiles) are written to tile_info pipeline + - LDGSTS warps use token_id_mapping predicates to skip invalid tokens (token_id == -1) + - When no more valid tiles 
exist, outer loop exits and calls producer_tail() + - Consumer warps process only valid tiles from pipeline + - No deadlock or synchronization issues +* Consumer warps check initial tile against num_non_exiting_tiles and set + is_valid_tile=False if tile_idx >= num_non_exiting_tiles +* Only rows within (aligned_groupm[0]+aligned_groupm[1]+...) contain valid data +* Padding rows in C matrix will not be written by the kernel +""" + + +class BlockScaledContiguousGatherGroupedGemmKernel: + """This class implements contiguous grouped matrix multiplication with gather operation and SwiGLU fusion + for FC1 layer computation (C = up * silu(gate), where up/gate come from interleaved GEMM result). + + The computation flow: + 1. GEMM: acc = alpha * (SFA * A[token_ids]) * (SFB * B) + 2. SwiGLU: C = up * silu(gate), extracted from interleaved acc with granularity=64 + 3. Optional Quant: When c_dtype is Float4E2M1FN, generates SFC and quantizes output + + Note: Output C has N/2 columns since pairs of (up, gate) are combined by SwiGLU. + + Key Features: + - Uses LDGSTS instructions for loading A and SFA matrices with gather/permutation capability + - Uses TMA (Tensor Memory Access) for loading B and SFB matrices with multicast + - Token ID mapping enables efficient gather operation during A/SFA load + - SwiGLU activation fusion in epilogue (up * silu(gate) with interleaved weights) + - Optional quantization fusion for Float4E2M1FN output with scale factor generation + - Warp specialization: Scheduler (warp 10), LDGSTS A/SFA (warps 4-7), TMA B/SFB (warp 9), + MMA (warp 8), Epilogue (warps 0-3) + + :param sf_vec_size: Scalefactor vector size (16 for NVF4, 32 for MXF4/MXF8). + :type sf_vec_size: int + :param acc_dtype: Data type of the accumulator (e.g., cutlass.Float32). + :type acc_dtype: Type[cutlass.Numeric] + :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N). 
+ Note: use_2cta_instrs is automatically inferred from mma_tiler_mn[0] + (True when M=256, False when M=128). + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing + :type cluster_shape_mn: Tuple[int, int] + :param vectorized_f32: Whether to use vectorized f32x2 operations for better performance. + :type vectorized_f32: bool + + :note: In current version, A and B tensor must have the same data type + - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported + + :note: Supported combinations of A/B data types, SF data typs and SF vector size: + - MXF8: A/B: Float8E5M2/Float8E4M3FN + SF: Float8E8M0FNU + sf_vec_size: 32 + - MXF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU + sf_vec_size: 32 + - NVF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU/Float8E4M3FN + sf_vec_size: 16 + + :note: Supported accumulator data types: + - Float32 + + :note: Supported C data types: + - Float32 + - Float16/BFloat16 + - Float8E4M3FN/Float8E5M2 + # {$nv-internal-release begin} + # Note: Float4E2M1FN output includes SFC generation and quantization support for internal testing. + - Float4E2M1FN (with scale factor generation) + # {$nv-internal-release end} + + :note: Constraints: + - MMA tiler M must be 128 or 256 (use_2cta_instrs) + - MMA tiler N must be 64/128/192/256 + - Cluster shape M must be multiple of 2 if Mma tiler M is 256 + - Cluster shape M/N must be positive and power of 2, total cluster size <= 16 + - Also, Cluster shape M/N must be <= 4 for scale factor multicasts due to limited size of scale factors + + Example: + >>> # Note: use_2cta_instrs is auto-inferred from mma_tiler_mn[0] + >>> # (True when M=256, False when M=128) + >>> gemm = BlockScaledContiguousGatherGroupedGemmKernel( + ... sf_vec_size=16, + ... acc_dtype=cutlass.Float32, + ... mma_tiler_mn=(256, 128), # use_2cta_instrs=True since M=256 + ... cluster_shape_mn=(2, 1), + ... vectorized_f32=True, + ... ) + >>> gemm( + ... a=a_tensor, + ... b=b_tensor, + ... 
c=c_tensor, + ... sfa=sfa_tensor, + ... sfb=sfb_tensor, + ... sfc_tensor=None, + ... norm_const_tensor=None, + ... tile_idx_to_expert_idx=tile_idx_to_expert_idx, + ... tile_idx_to_mn_limit=tile_idx_to_mn_limit, + ... token_id_mapping_tensor=token_id_mapping_tensor, + ... num_non_exiting_tiles=num_non_exiting_tiles, + ... alpha=alpha, + ... max_active_clusters=max_active_clusters, + ... stream=stream, + ... ) + """ + + def __init__( + self, + sf_vec_size: int, + acc_dtype: Type[cutlass.Numeric], + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + vectorized_f32: bool, + topk: int, + ): + """Initializes the configuration for a Blackwell blockscaled dense GEMM kernel with + gather operation and SwiGLU fusion. + + This configuration includes several key aspects: + + 1. MMA Instruction Settings (tcgen05): + - acc_dtype: Data types for MMA accumulator. + - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler. + - use_2cta_instrs: Automatically inferred from mma_tiler_mn[0] + (True when M=256, False when M=128). + + 2. Cluster Shape: + - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster. + + 3. Scale Factor Configuration: + - sf_vec_size: Vector size for block-scaled quantization. + + 4. Performance Optimization: + - vectorized_f32: Enable vectorized f32x2 operations. + + 5. MoE Configuration: + - topk: Number of experts selected per token (used for token ID mapping). + + :param sf_vec_size: Vector size for scale factors (16 for NVF4, 32 for MXF4/MXF8). + :type sf_vec_size: int + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction. + use_2cta_instrs is automatically set based on M (True if M=256, False if M=128). + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster. 
+ :type cluster_shape_mn: Tuple[int, int] + :param vectorized_f32: Enable vectorized f32x2 operations for better performance. + :type vectorized_f32: bool + :param topk: Number of experts selected per token (used for token ID mapping). + :type topk: int + """ + + self.sf_vec_size = sf_vec_size + self.topk = topk + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = mma_tiler_mn[0] == 256 + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + + self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE + + self.occupancy = 1 + self.epilog_warp_id = (0, 1, 2, 3) + self.ldgsts_a_warp_id = ( + 4, + 5, + 6, + 7, + ) + self.mma_warp_id = 8 + self.tma_b_warp_id = 9 + self.sched_warp_id = 10 + self.threads_per_warp = 32 + self.threads_per_cta = self.threads_per_warp * len( + ( + self.mma_warp_id, + *self.ldgsts_a_warp_id, + self.tma_b_warp_id, + *self.epilog_warp_id, + self.sched_warp_id, + ) + ) + self.threads_wo_sched = self.threads_per_warp * len( + ( + *self.epilog_warp_id, + self.mma_warp_id, + self.tma_b_warp_id, + *self.ldgsts_a_warp_id, + ) + ) + + # Set barrier for cta sync, epilogue sync and tmem ptr sync + self.cta_sync_barrier = pipeline.NamedBarrier( + barrier_id=1, + num_threads=self.threads_per_cta, + ) + self.epilog_sync_barrier = pipeline.NamedBarrier( + barrier_id=2, + num_threads=32 * len(self.epilog_warp_id), + ) + self.tmem_alloc_barrier = pipeline.NamedBarrier( + barrier_id=3, + num_threads=32 * len((self.mma_warp_id, *self.epilog_warp_id)), + ) + self.sched_sync_barrier = pipeline.NamedBarrier( + barrier_id=4, + num_threads=self.threads_per_warp, + ) + self.num_smem_capacity = utils.get_smem_capacity_in_bytes("sm_100") + SM100_TMEM_CAPACITY_COLUMNS = 512 + self.num_tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS + self.vectorized_f32 = vectorized_f32 + + def _setup_attributes(self): + """Set up configurations that are 
dependent on GEMM inputs + + This method configures various attributes based on the input tensor properties + (data types, leading dimensions) and kernel settings: + - Configuring tiled MMA + - Computing MMA/cluster/tile shapes + - Computing cluster layout + - Computing multicast CTAs for A/B + - Computing epilogue subtile + - Setting up A/B/C stage counts in shared memory + - Computing A/B/C shared memory layout + - Computing tensor memory allocation columns + """ + + self.mma_inst_shape_mn = ( + self.mma_tiler[0], + self.mma_tiler[1], + ) + # (CTA_Tile_Shape_M, Round_Up(MMA_Tile_Shape_N, 128), MMA_Inst_Shape_K) + self.mma_inst_shape_mn_sfb = ( + self.mma_inst_shape_mn[0] // (2 if self.use_2cta_instrs else 1), + cute.round_up(self.mma_inst_shape_mn[1], 128), + ) + + # Configure tiled mma + tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.sf_dtype, + self.sf_vec_size, + self.cta_group, + self.mma_inst_shape_mn, + ) + + tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.sf_dtype, + self.sf_vec_size, + cute.nvgpu.tcgen05.CtaGroup.ONE, + self.mma_inst_shape_mn_sfb, + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + + self.mma_tiler_sfa = ( + self.mma_inst_shape_mn[0], + self.mma_inst_shape_mn[1], + mma_inst_shape_k * mma_inst_tile_k // 16, + ) + + self.mma_tiler_sfb = ( + self.mma_inst_shape_mn_sfb[0], + self.mma_inst_shape_mn_sfb[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + self.cta_tile_shape_mnk_sfa = ( + self.mma_tiler_sfa[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler_sfa[1], + 
self.mma_tiler_sfa[2], + ) + + self.mma_tiler_c = ( + self.mma_inst_shape_mn[0], + self.mma_inst_shape_mn[1] // 2, + mma_inst_shape_k * mma_inst_tile_k, + ) + + self.cta_tile_shape_mnk_c = ( + self.mma_tiler_c[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler_c[1], + self.mma_tiler_c[2], + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + self.cluster_layout_sfb_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma_sfb.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + self.epi_tile = (128, 64) + self.epi_tile_cnt = ( + self.cta_tile_shape_mnk_c[0] // self.epi_tile[0], + self.cta_tile_shape_mnk_c[1] // self.epi_tile[1], + ) + + # Setup A/B/C/Scale stage count in shared memory and ACC stage count in tensor memory + ( + self.num_acc_stage, + self.num_ab_stage, + self.num_c_stage, + self.num_tile_stage, + ) = self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.sf_dtype, + self.sf_vec_size, + self.num_smem_capacity, + self.occupancy, + ) + + # Compute A/B/C/Scale shared memory layout + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa( + tiled_mma, + self.mma_tiler, + self.sf_vec_size, + self.num_ab_stage, + ) + self.sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb( + tiled_mma, + self.mma_tiler, + self.sf_vec_size, + self.num_ab_stage, + ) + + 
self.c_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_c_stage, + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = 512 + + @cute.jit + def __call__( + self, + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + sfa: cute.Tensor, + sfb: cute.Tensor, + sfc_tensor: Optional[cute.Tensor], + norm_const_tensor: Optional[cute.Tensor], + tile_idx_to_expert_idx: cute.Tensor, + tile_idx_to_mn_limit: cute.Tensor, + token_id_mapping_tensor: cute.Tensor, + num_non_exiting_tiles: cute.Tensor, + alpha: cute.Tensor, + max_active_clusters: cutlass.Constexpr, + stream: cuda.CUstream, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + """Execute the contiguous grouped GEMM with gather operation and SwiGLU fusion. + + This method performs FC1 layer computation: + 1. GEMM: acc = alpha * (SFA * A[token_ids]) * (SFB * B) + 2. SwiGLU: C = up * silu(gate), where up/gate are extracted from interleaved acc (granularity=64) + 3. Optional Quant: When c_dtype is Float4E2M1FN, generates SFC and quantizes output + + Data loading: + - A and SFA are loaded using LDGSTS instructions with token-based gather + - B and SFB are loaded using TMA instructions with multicast + - B weights are interleaved: [up_0:64, gate_64:128, up_128:192, gate_192:256, ...] + + Execution steps: + 1. Setup static attributes before smem/grid computation + 2. Setup TMA load/store atoms for B, SFB, and C (no TMA for A/SFA) + 3. Compute grid size with regard to hardware constraints + 4. Define shared storage for kernel + 5. 
Launch the kernel synchronously with warp specialization: + - Scheduler warp: Dispatches tile information + - LDGSTS warps: Load A and SFA with gather + - TMA warp: Load B and SFB with multicast + - MMA warp: Perform matrix multiply-accumulate + - Epilogue warps: Apply SwiGLU activation, optional quantization, and store results + + :param a: Input tensor A (MxKx1), will be gathered using token_id_mapping + :type a: cute.Tensor + :param b: Input tensor B (NxKxL), L is the number of experts/groups, weights are interleaved for SwiGLU + :type b: cute.Tensor + :param c: Output tensor C (Mx(N/2)x1), N is halved due to SwiGLU fusion + :type c: cute.Tensor + :param sfa: Scale factor tensor A, will be gathered using token_id_mapping + :type sfa: cute.Tensor + :param sfb: Scale factor tensor B + :type sfb: cute.Tensor + :param sfc_tensor: Scale factor tensor C for quantized output (None if not quantizing) + :type sfc_tensor: Optional[cute.Tensor] + :param norm_const_tensor: Normalization constant for scale factor generation + (None if not quantizing) + :type norm_const_tensor: Optional[cute.Tensor] + :param tile_idx_to_expert_idx: Mapping from tile index to expert ID, + shape (permuted_m/cta_tile_m,) where cta_tile_m is the CTA tile M size + :type tile_idx_to_expert_idx: cute.Tensor + :param tile_idx_to_mn_limit: Mapping from tile index to M-N dimension limit + for boundary checking, shape (permuted_m/cta_tile_m,) + :type tile_idx_to_mn_limit: cute.Tensor + :param token_id_mapping_tensor: Token ID mapping for gather operation, shape (permuted_m,) + :type token_id_mapping_tensor: cute.Tensor + :param num_non_exiting_tiles: Number of valid tiles to process (valid_m/cta_tile_m), shape (1,) + :type num_non_exiting_tiles: cute.Tensor + :param alpha: Alpha tensor for each group + :type alpha: cute.Tensor + :param max_active_clusters: Maximum number of active clusters + :type max_active_clusters: cutlass.Constexpr + :param stream: CUDA stream for asynchronous execution + :type 
stream: cuda.CUstream + :param epilogue_op: Optional elementwise lambda function to apply to the output tensor + :type epilogue_op: cutlass.Constexpr + :raises TypeError: If input data types are incompatible with the MMA instruction. + """ + # Setup static attributes before smem/grid/tma computation + self.a_dtype: Type[cutlass.Numeric] = a.element_type + self.b_dtype: Type[cutlass.Numeric] = b.element_type + self.c_dtype: Type[cutlass.Numeric] = c.element_type + self.sf_dtype: Type[cutlass.Numeric] = sfa.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(c) + + # Check if input data types are compatible with MMA instruction + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL) + sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(b.shape, self.sf_vec_size) + sfb = cute.make_tensor(sfb.iterator, sfb_layout) + + self.generate_sfc = sfc_tensor is not None and norm_const_tensor is not None + if cutlass.const_expr(self.generate_sfc): + sfc_layout = blockscaled_utils.tile_atom_to_shape_SF(c.shape, self.sf_vec_size) + sfc_tensor = cute.make_tensor(sfc_tensor.iterator, sfc_layout) + + tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.sf_dtype, + self.sf_vec_size, + self.cta_group, + self.mma_inst_shape_mn, + ) + + # For 2CTA blockscaled kernels, SFB needs to be replicated across peer CTAs. 
# {$nv-internal-release} + tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.sf_dtype, + self.sf_vec_size, + cute.nvgpu.tcgen05.CtaGroup.ONE, + self.mma_inst_shape_mn_sfb, + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for B + b_op = sm100_utils.cluster_shape_to_tma_atom_B(self.cluster_shape_mn, tiled_mma.thr_id) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B( + b_op, + b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # Setup TMA load for SFB + sfb_op = sm100_utils.cluster_shape_to_tma_atom_SFB(self.cluster_shape_mn, tiled_mma.thr_id) + sfb_smem_layout = cute.slice_(self.sfb_smem_layout_staged, (None, None, None, 0)) + tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B( + sfb_op, + sfb, + sfb_smem_layout, + self.mma_tiler_sfb, + tiled_mma_sfb, + self.cluster_layout_sfb_vmnk.shape, + internal_type=cutlass.Int16, + ) + + # {$nv-internal-release begin} + # This modifies the layout to handle overlapping 256x(# of scale factors for a single column of B (nNSF)) + # logical blocks for SFB when cta_tile_shape_n=192. 
+ # {$nv-internal-release end} + if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 192): + x = tma_tensor_sfb.stride[0][1] + y = cute.ceil_div(tma_tensor_sfb.shape[0][1], 4) + + new_shape = ( + (tma_tensor_sfb.shape[0][0], ((2, 2), y)), + tma_tensor_sfb.shape[1], + tma_tensor_sfb.shape[2], + ) + # Use right multiplication for ScaledBasis (3 * x instead of x * 3) + x_times_3 = 3 * x + new_stride = ( + (tma_tensor_sfb.stride[0][0], ((x, x), x_times_3)), + tma_tensor_sfb.stride[1], + tma_tensor_sfb.stride[2], + ) + tma_tensor_sfb_new_layout = cute.make_layout(new_shape, stride=new_stride) + tma_tensor_sfb = cute.make_tensor(tma_tensor_sfb.iterator, tma_tensor_sfb_new_layout) + + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + sfb_copy_size = cute.size_in_bytes(self.sf_dtype, sfb_smem_layout) + self.num_tma_load_bytes = (b_copy_size + sfb_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tiled_tma_atom( + cpasync.CopyBulkTensorTileS2GOp(), + c, + epi_smem_layout, + self.epi_tile, + ) + + # Compute grid size + self.tile_sched_params, grid = self._compute_grid( + c, self.cta_tile_shape_mnk_c, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + # (bidx, bidy, bidz, valid, mn_limit) + sInfo: cute.struct.Align[ + cute.struct.MemRange[cutlass.Int32, 5 * self.num_tile_stage], + # 1 byte alignment + 1, + ] + a_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2] + b_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2] + acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2] + tile_info_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_tile_stage * 2] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # 
(EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + cute.cosize(self.c_smem_layout_staged.outer), + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer)], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer)], + self.buffer_align_bytes, + ] + # (granularity_m, repeat_m), (granularity_k, repeat_k), num_scale_stage) + sSFA: cute.struct.Align[ + cute.struct.MemRange[self.sf_dtype, cute.cosize(self.sfa_smem_layout_staged)], + self.buffer_align_bytes, + ] + # (granularity_n, repeat_n), (granularity_k, repeat_k), num_scale_stage) + sSFB: cute.struct.Align[ + cute.struct.MemRange[self.sf_dtype, cute.cosize(self.sfb_smem_layout_staged)], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tiled_mma_sfb, + a, + tma_atom_b, + tma_tensor_b, + sfa, + tma_atom_sfb, + tma_tensor_sfb, + tma_atom_c, + tma_tensor_c, + sfc_tensor, + norm_const_tensor, + tile_idx_to_expert_idx, + tile_idx_to_mn_limit, + token_id_mapping_tensor, + num_non_exiting_tiles, + alpha, + self.cluster_layout_vmnk, + self.cluster_layout_sfb_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.sfa_smem_layout_staged, + self.sfb_smem_layout_staged, + self.c_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + epilogue_op, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + min_blocks_per_mp=1, + ) + return + + def mainloop_s2t_copy_and_partition( + self, + sSF: cute.Tensor, + tSF: cute.Tensor, + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for smem to tmem load for scale factor 
tensor, then use it to + partition smem memory (source) and tensor memory (destination). + + :param sSF: The scale factor tensor in smem + :type sSF: cute.Tensor + :param tSF: The scale factor tensor in tmem + :type tSF: cute.Tensor + + :return: A tuple containing (tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t) where: + - tiled_copy_s2t: The tiled copy operation for smem to tmem load for scale factor tensor(s2t) + - tCsSF_compact_s2t: The partitioned scale factor tensor in smem + - tSF_compact_s2t: The partitioned scale factor tensor in tmem + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # (MMA, MMA_MN, MMA_K, STAGE) + tCsSF_compact = cute.filter_zeros(sSF) + # (MMA, MMA_MN, MMA_K) + tCtSF_compact = cute.filter_zeros(tSF) + + # Make S2T CopyAtom and tiledCopy + copy_atom_s2t = cute.make_copy_atom( + tcgen05.Cp4x32x128bOp(self.cta_group), + self.sf_dtype, + ) + tiled_copy_s2t = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSF_compact) + thr_copy_s2t = tiled_copy_s2t.get_slice(0) + + # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE) + tCsSF_compact_s2t_ = thr_copy_s2t.partition_S(tCsSF_compact) + # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE) + tCsSF_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(tiled_copy_s2t, tCsSF_compact_s2t_) + # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K) + tCtSF_compact_s2t = thr_copy_s2t.partition_D(tCtSF_compact) + + return tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tiled_mma_sfb: cute.TiledMma, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + mSFA_mkl: cute.Tensor, + tma_atom_sfb: cute.CopyAtom, + mSFB_nkl: cute.Tensor, + tma_atom_c: cute.CopyAtom, + mC_mnl: cute.Tensor, + mSFC_mnl: Optional[cute.Tensor], + norm_const_tensor: Optional[cute.Tensor], + tile_idx_to_expert_idx: cute.Tensor, + tile_idx_to_mn_limit: cute.Tensor, + token_id_mapping_tensor: cute.Tensor, + 
num_non_exiting_tiles: cute.Tensor, + alpha: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + cluster_layout_sfb_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + sfa_smem_layout_staged: cute.Layout, + sfb_smem_layout_staged: cute.Layout, + c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + epilogue_op: cutlass.Constexpr, + ): + """ + GPU device kernel performing the Persistent batched GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_b_warp_id: + # cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + # cpasync.prefetch_descriptor(tma_atom_sfa) + cpasync.prefetch_descriptor(tma_atom_sfb) + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coords inside cluster + bidx, bidy, bidz = cute.arch.block_idx() + mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster()) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + + block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + # Pipeline Init: Initialize A pipeline for LDGSTS operations + # Producer: 4 warps (warps 4-7) with 128 threads total for LDGSTS operations + # Consumer: MMA warp for consuming A/SFA data + a_pipeline_producer_group = pipeline.CooperativeGroup( + 
pipeline.Agent.Thread, + 128 + * cute.size( + cluster_layout_vmnk, mode=[0] + ), # 4 warps * 32 threads per warp = 128 threads + ) + + a_pipeline = PipelineCpAsyncUmma.create( + barrier_storage=storage.a_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=a_pipeline_producer_group, + consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + cta_layout_vmnk=cluster_layout_vmnk, + defer_sync=True, + enable_cp_async=(not self.use_2cta_instrs), + ) + + # Pipeline Init: Initialize B pipeline for TMA operations + # Using PipelineTmaUmma for B/SFB since they use TMA load with multicast support + # Producer: TMA B/SFB warp (warp 9) - 1 warp issuing TMA operations + # Consumer: MMA warp for consuming B/SFB data + b_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread) + num_tma_producer = self.num_mcast_ctas_b + b_pipeline_consumer_group = pipeline.CooperativeGroup( + pipeline.Agent.Thread, num_tma_producer + ) + b_pipeline = pipeline.PipelineTmaUmma.create( + barrier_storage=storage.b_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=b_pipeline_producer_group, + consumer_group=b_pipeline_consumer_group, + tx_count=self.num_tma_load_bytes, # Total bytes loaded by TMA (B + SFB) + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Pipeline Init: Initialize acc_pipeline (barrier) and states + acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread) + num_acc_consumer_threads = len(self.epilog_warp_id) * (2 if use_2cta_instrs else 1) + acc_pipeline_consumer_group = pipeline.CooperativeGroup( + pipeline.Agent.Thread, num_acc_consumer_threads + ) + acc_pipeline = pipeline.PipelineUmmaAsync.create( + barrier_storage=storage.acc_mbar_ptr.data_ptr(), + num_stages=self.num_acc_stage, + producer_group=acc_pipeline_producer_group, + consumer_group=acc_pipeline_consumer_group, + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Pipeline Init: Tensor memory dealloc barrier init + 
tile_info_pipeline_producer_group = pipeline.CooperativeGroup( + pipeline.Agent.Thread, + self.threads_per_warp * 1, + ) + tile_info_pipeline_consumer_group = pipeline.CooperativeGroup( + pipeline.Agent.Thread, + self.threads_wo_sched, + ) + tile_info_pipeline = pipeline.PipelineAsync.create( + barrier_storage=storage.tile_info_mbar_ptr.data_ptr(), + num_stages=self.num_tile_stage, + producer_group=tile_info_pipeline_producer_group, + consumer_group=tile_info_pipeline_consumer_group, + ) + + # Tensor memory dealloc barrier init + tmem = utils.TmemAllocator( + storage.tmem_holding_buf, + barrier_for_retrieve=self.tmem_alloc_barrier, + allocator_warp_id=self.epilog_warp_id[0], + is_two_cta=use_2cta_instrs, + two_cta_tmem_dealloc_mbar_ptr=storage.tmem_dealloc_mbar_ptr, + ) + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C/Scale + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = storage.sC.get_tensor(c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner) + # (granularity_m, repeat_m), (granularity_k, repeat_k), num_scale_stage) + sSFA = storage.sSFA.get_tensor(sfa_smem_layout_staged) + # (granularity_n, repeat_n), (granularity_k, repeat_k), num_scale_stage) + sSFB = storage.sSFB.get_tensor(sfb_smem_layout_staged) + # (bidx, bidy, bidz, valid) + info_layout = cute.make_layout((5, self.num_tile_stage), stride=(1, 5)) + sInfo = storage.sInfo.get_tensor(info_layout) + + # + # Compute multicast mask for A/B buffer full + # + # a_full_mcast_mask = None + b_full_mcast_mask = None + # sfa_full_mcast_mask = None + sfb_full_mcast_mask = None + if cutlass.const_expr(self.is_b_mcast or use_2cta_instrs): + b_full_mcast_mask = 
cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + sfb_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_sfb_vmnk, block_in_cluster_coord_sfb_vmnk, mcast_mode=1 + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.cta_tile_shape_mnk, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + + # (bM, bK, RestM, RestK, RestL) + gSFA_mkl = cute.local_tile( + mSFA_mkl, cute.slice_(self.cta_tile_shape_mnk_sfa, (None, 0, None)), (None, None, None) + ) + + # (bN, bK, RestN, RestK, RestL) + gSFB_nkl = cute.local_tile( + mSFB_nkl, + cute.slice_(self.mma_tiler_sfb, (0, None, None)), + (None, None, None), + ) + + gToken_ml = cute.local_tile( + token_id_mapping_tensor, cute.slice_(self.cta_tile_shape_mnk, (None, 0, 0)), (None,) + ) + + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler_c, (None, None, 0)), (None, None, None) + ) + k_tile_cnt = cute.size(gA_mkl, mode=[3]) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + thr_mma_sfb = tiled_mma_sfb.get_slice(mma_tile_coord_v) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_N, MMA_K, RestN, RestK, RestL) + tCgSFB = thr_mma_sfb.partition_B(gSFB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for TMA load B + # + # TMA load B partition_S/D + b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + 
cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # TMA load SFB partition_S/D + sfb_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_sfb_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), RestN, RestK, RestL) + tBsSFB, tBgSFB = cute.nvgpu.cpasync.tma_partition( + tma_atom_sfb, + block_in_cluster_coord_sfb_vmnk[1], + sfb_cta_layout, + cute.group_modes(sSFB, 0, 3), + cute.group_modes(tCgSFB, 0, 3), + ) + tBsSFB = cute.filter_zeros(tBsSFB) + tBgSFB = cute.filter_zeros(tBgSFB) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, self.num_acc_stage)) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + self.cta_sync_barrier.arrive_and_wait() + + # + # Specialized Schedule warp + # + if warp_idx == self.sched_warp_id: + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + # First tile + work_tile = tile_sched.initial_work_tile_info() + + tile_info_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_tile_stage + ) + + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_m = cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape) + if mma_tile_coord_m < num_non_exiting_tiles[0]: + tile_info_pipeline.producer_acquire(tile_info_producer_state) + cur_tile_coord = work_tile.tile_idx + expert_idx = tile_idx_to_expert_idx[mma_tile_coord_m] + mn_limit = tile_idx_to_mn_limit[mma_tile_coord_m] + with 
cute.arch.elect_one(): + sInfo[(0, tile_info_producer_state.index)] = cur_tile_coord[0] + sInfo[(1, tile_info_producer_state.index)] = cur_tile_coord[1] + sInfo[(2, tile_info_producer_state.index)] = expert_idx + sInfo[(3, tile_info_producer_state.index)] = cutlass.Int32( + work_tile.is_valid_tile + ) + sInfo[(4, tile_info_producer_state.index)] = mn_limit + # fence view async shared + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + + self.sched_sync_barrier.arrive_and_wait() + tile_info_pipeline.producer_commit(tile_info_producer_state) + tile_info_producer_state.advance() + + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + tile_info_pipeline.producer_acquire(tile_info_producer_state) + with cute.arch.elect_one(): + sInfo[(0, tile_info_producer_state.index)] = work_tile.tile_idx[0] + sInfo[(1, tile_info_producer_state.index)] = work_tile.tile_idx[1] + sInfo[(2, tile_info_producer_state.index)] = -1 + sInfo[(3, tile_info_producer_state.index)] = cutlass.Int32(0) + sInfo[(4, tile_info_producer_state.index)] = -1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + self.sched_sync_barrier.arrive_and_wait() + tile_info_pipeline.producer_commit(tile_info_producer_state) + tile_info_producer_state.advance() + tile_info_pipeline.producer_tail(tile_info_producer_state) + + # + # Specialized LDGSTS A/SFA warps (warps 4-7) + # These warps use LDGSTS instructions to load A and SFA from global to shared memory + # with gather/permutation capability enabled by token_id_mapping + # + if warp_idx <= self.ldgsts_a_warp_id[-1] and warp_idx >= self.ldgsts_a_warp_id[0]: + # cute.arch.warpgroup_reg_dealloc(self.num_regs_uniform_warps) + # + # Setup LDGSTS copy atoms for A and SFA + # A: 8x LDGSTS.128 per thread with swizzle_128B for A matrix (32 elements per thread) + # SFA: 4x LDGSTS.32 per thread with 512-element block swizzling for 
scale factor A (4 elements per thread) + # + a_atom_copy = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL), + mA_mkl.element_type, + num_bits_per_copy=128, + ) + a_thread_layout = cute.make_layout((16, 8), stride=(8, 1)) + a_value_layout = cute.make_layout((1, 32), stride=(32, 1)) + a_tiled_copy = cute.make_tiled_copy_tv( + a_atom_copy, + a_thread_layout, + a_value_layout, + ) + + sfa_atom_copy = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mSFA_mkl.element_type, + num_bits_per_copy=32, + ) + tidx_in_warpgroup = tidx % 128 + + sA_tiled = cute.make_tensor( + sA.iterator, + layout=cute.make_layout( + (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2], self.num_ab_stage), + stride=( + self.cta_tile_shape_mnk[2], + 1, + self.cta_tile_shape_mnk[0] * self.cta_tile_shape_mnk[2], + ), + ), + ) + a_thr_copy = a_tiled_copy.get_slice(tidx_in_warpgroup) + tAsA_tiled = a_thr_copy.partition_D(sA_tiled) + + a_token_offset_tensor = cute.make_rmem_tensor( + cute.make_layout((8,)), + cutlass.Int32, + ) + a_predicate_tensor = cute.make_rmem_tensor( + cute.make_layout((8,)), + cutlass.Boolean, + ) + sfa_token_offset_tensor = cute.make_rmem_tensor( + cute.make_layout((1,)), + cutlass.Int32, + ) + sfa_predicate_tensor = cute.make_rmem_tensor( + cute.make_layout((1,)), + cutlass.Boolean, + ) + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + # First tile + work_tile = tile_sched.initial_work_tile_info() + + a_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_ab_stage + ) + + tile_info_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_tile_stage + ) + + # Get the first tile info + tile_info = cute.make_rmem_tensor((5,), cutlass.Int32) + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in 
cutlass.range(5, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + + while is_valid_tile: + # Get tile coord from tile scheduler + # cur_tile_coord = work_tile.tile_idx + + # Load token IDs for gather operation + # For A matrix: each thread loads 8 token offsets (for 8 LDGSTS.128 operations) + # For SFA matrix: each thread loads 1 token offset (for 4 LDGSTS.32 operations) + gToken_ml_tile = gToken_ml[(None, tile_info[0])] + for i in range(8): + token_ml_tile_offset = (tidx_in_warpgroup // 8) + i * 16 + a_token_offset_tensor[i] = gToken_ml_tile[token_ml_tile_offset] + a_predicate_tensor[i] = ( + cutlass.Boolean(1) + if tile_info[0] * self.cta_tile_shape_mnk[0] + token_ml_tile_offset + < tile_info[4] + else cutlass.Boolean(0) + ) + a_token_offset_tensor[i] = ( + a_token_offset_tensor[i] // self.topk + if tile_info[0] * self.cta_tile_shape_mnk[0] + token_ml_tile_offset + < tile_info[4] + else 0 + ) + + token_ml_tile_offset = ( + 8 * (tidx_in_warpgroup // 32) + + 32 * ((tidx_in_warpgroup % 32) // 8) + + (tidx_in_warpgroup % 8) + ) + sfa_token_offset_tensor[0] = gToken_ml_tile[token_ml_tile_offset] // self.topk + sfa_predicate_tensor[0] = ( + cutlass.Boolean(1) + if tile_info[0] * self.cta_tile_shape_mnk[0] + token_ml_tile_offset + < tile_info[4] + else cutlass.Boolean(0) + ) + relative_sfa_token_offset = sfa_token_offset_tensor[0] + + tAgA = gA_mkl[(None, None, 0, None, 0)] + A_gmem_thread_offset = cute.assume((tidx_in_warpgroup % 8) * 32, divby=32) + tAgSFA = gSFA_mkl[(relative_sfa_token_offset, None, 0, None, 0)] + + tAsSFA = sSFA[ + ( + ( + ( + ( + 8 * (tidx_in_warpgroup // 32) + (tidx_in_warpgroup % 8), + (tidx_in_warpgroup % 32) // 8, + ), + None, + ), + None, + ), + None, + None, + None, + 
) + ] + + # Peek (try_wait) SCALE buffer empty + a_producer_state.reset_count() + peek_a_empty_status = cutlass.Boolean(1) + if a_producer_state.count < k_tile_cnt: + peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state) + + # + # Load A and SFA with LDGSTS and gather/permutation + # Each K-tile iteration loads one K-tile of A and SFA from GMEM to SMEM + # using LDGSTS instructions with token-based gather addressing + # + for k_tile in cutlass.range(0, k_tile_cnt, 1, unroll=1): + # Conditionally wait for AB buffer empty + a_pipeline.producer_acquire(a_producer_state, peek_a_empty_status) + + tAgA_ktile = tAgA[(None, None, a_producer_state.count)] + tAsA_ktile = tAsA_tiled[(None, None, None, a_producer_state.index)] + + tAgSFA_ktile = tAgSFA[(None, a_producer_state.count)] + tAsSFA_ktile = tAsSFA[ + ( + None, + None, + None, + None, + a_producer_state.index, + ) + ] + + for i in range(8): + # + # Load A matrix: 8x LDGSTS.128 per thread with swizzle_128B + # Each LDGSTS.128 loads 32 elements (128 bits) from GMEM to SMEM + # Global memory address is computed using token offset for gather operation + # Predicate mask guards against invalid token IDs (padding tokens marked as -1) + # + A_gmem_slice_offset = A_gmem_thread_offset + cute.assume( + a_token_offset_tensor[i] * tAgA_ktile.layout[0].stride, divby=32 + ) + A_gmem_slice_offset = cute.assume(A_gmem_slice_offset, divby=32) + tAgA_slice_ptr = tAgA_ktile.iterator + A_gmem_slice_offset + tAgA_slice = cute.make_tensor( + tAgA_slice_ptr, layout=cute.make_layout((32,)) + ) + + tAsA_slice = cute.make_tensor( + tAsA_ktile[(None, i, None)].iterator, layout=cute.make_layout((32,)) + ) + a_predicate_slice = cute.make_rmem_tensor( + cute.make_layout((1,)), cutlass.Boolean + ) + a_predicate_slice[0] = a_predicate_tensor[i] + + cute.copy_atom_call( + a_atom_copy, tAgA_slice, tAsA_slice, pred=a_predicate_slice + ) + + for i in range(4): + # + # Load SFA: 4x LDGSTS.32 per thread with 512-element block swizzling + 
# Each LDGSTS.32 loads 4 scale factor elements (32 bits) from GMEM to SMEM + # Uses same token offset as A matrix for consistent gather operation + # + swizzled_iterator = (tidx_in_warpgroup % 32) // 8 ^ i + tAgSFA_slice_ptr = tAgSFA_ktile.iterator + 4 * swizzled_iterator + tAgSFA_slice = cute.make_tensor( + tAgSFA_slice_ptr, layout=cute.make_layout((4,)) + ) + + tAsSFA_slice_ptr = tAsSFA_ktile.iterator + 512 * swizzled_iterator + tAsSFA_slice = cute.make_tensor(tAsSFA_slice_ptr, cute.make_layout((4,))) + + cute.copy_atom_call( + sfa_atom_copy, tAgSFA_slice, tAsSFA_slice, pred=sfa_predicate_tensor + ) + + # Signal the completion of async + if cutlass.const_expr(self.use_2cta_instrs): + cute.arch.cp_async_commit_group() + cute.arch.cp_async_wait_group(0) + a_pipeline.producer_commit(a_producer_state) + + # Peek (try_wait) A buffer empty for k_tile = prefetch_k_tile_cnt + k_tile + 1 + a_producer_state.advance() + peek_a_empty_status = cutlass.Boolean(1) + if a_producer_state.count < k_tile_cnt: + peek_a_empty_status = a_pipeline.producer_try_acquire(a_producer_state) + + # + # Advance to next tile + # + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(5, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + + # + # Wait A pipeline buffer empty + # + a_pipeline.producer_tail(a_producer_state) + + # + # Specialized TMA B/SFB load warp (warp 9) + # This warp uses TMA instructions to load B and SFB from global to shared memory + # with multicast support to reduce L2 memory traffic + # + if warp_idx == self.tma_b_warp_id: + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), 
cute.arch.grid_dim() + ) + # First tile + work_tile = tile_sched.initial_work_tile_info() + + b_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_ab_stage + ) + + tile_info_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_tile_stage + ) + + # Get the first tile info + tile_info = cute.make_rmem_tensor((4,), cutlass.Int32) + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + + while is_valid_tile: + mma_tile_coord_mnl = ( + tile_info[0] // cute.size(tiled_mma.thr_id.shape), + tile_info[1], + tile_info[2], + ) + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])] + + # ((atom_v, rest_v), RestK) + # tAgSFA_slice = tAgSFA[(None, mma_tile_coord_mnl[0], None, 0)] + + # Apply SFB slicing hack when cta_tile_shape_n=64 # {$nv-internal-release} + slice_n = mma_tile_coord_mnl[1] + if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 64): + slice_n = mma_tile_coord_mnl[1] // 2 + + # ((atom_v, rest_v), RestK) + tBgSFB_slice = tBgSFB[(None, slice_n, None, mma_tile_coord_mnl[2])] + + # Peek (try_wait) AB buffer empty for k_tile = prefetch_k_tile_cnt + b_producer_state.reset_count() + peek_ab_empty_status = cutlass.Boolean(1) + if b_producer_state.count < k_tile_cnt: + peek_ab_empty_status = b_pipeline.producer_try_acquire(b_producer_state) + # + # Tma load loop + # + for k_tile in cutlass.range(0, k_tile_cnt, 1, unroll=1): + # Conditionally wait for B buffer empty + b_pipeline.producer_acquire(b_producer_state, peek_ab_empty_status) + + 
tBgB_k = tBgB_slice[(None, b_producer_state.count)] + tBgSFB_k = tBgSFB_slice[(None, b_producer_state.count)] + tBsB_pipe = tBsB[(None, b_producer_state.index)] + tBsSFB_pipe = tBsSFB[(None, b_producer_state.index)] + + tma_bar = b_pipeline.producer_get_barrier(b_producer_state) + + # TMA load B + cute.copy( + tma_atom_b, + tBgB_k, + tBsB_pipe, + tma_bar_ptr=tma_bar, + mcast_mask=b_full_mcast_mask, + ) + + # TMA load SFB + cute.copy( + tma_atom_sfb, + tBgSFB_k, + tBsSFB_pipe, + tma_bar_ptr=tma_bar, + mcast_mask=sfb_full_mcast_mask, + ) + + # Peek (try_wait) AB buffer empty for k_tile = prefetch_k_tile_cnt + k_tile + 1 + b_producer_state.advance() + peek_ab_empty_status = cutlass.Boolean(1) + if b_producer_state.count < k_tile_cnt: + peek_ab_empty_status = b_pipeline.producer_try_acquire(b_producer_state) + + # + # Advance to next tile + # + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + # + # Wait A/B buffer empty + # + b_pipeline.producer_tail(b_producer_state) + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # + # Bar sync for retrieve tensor memory ptr from shared mem + # + tmem.wait_for_alloc() + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + acc_tmem_ptr = tmem.retrieve_ptr(self.acc_dtype) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout) + + # Make SFA tmem tensor + sfa_tmem_ptr = cute.recast_ptr( + acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base), + dtype=self.sf_dtype, + ) + # (MMA, MMA_M, MMA_K) + tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa( + tiled_mma, + self.mma_tiler, + 
self.sf_vec_size, + cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)), + ) + tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout) + + # Make SFB tmem tensor + sfb_tmem_ptr = cute.recast_ptr( + acc_tmem_ptr + + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base) + + tcgen05.find_tmem_tensor_col_offset(tCtSFA), + dtype=self.sf_dtype, + ) + # (MMA, MMA_N, MMA_K) + tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb( + tiled_mma, + self.mma_tiler, + self.sf_vec_size, + cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)), + ) + tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout) + + # Partition for S2T copy of SFA/SFB + # + ( + tiled_copy_s2t_sfa, + tCsSFA_compact_s2t, + tCtSFA_compact_s2t, + ) = self.mainloop_s2t_copy_and_partition(sSFA, tCtSFA) + ( + tiled_copy_s2t_sfb, + tCsSFB_compact_s2t, + tCtSFB_compact_s2t, + ) = self.mainloop_s2t_copy_and_partition(sSFB, tCtSFB) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + a_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_ab_stage + ) + b_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_ab_stage + ) + acc_producer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Producer, self.num_acc_stage + ) + + tile_info_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_tile_stage + ) + + # Get the first tile info from pipeline (scheduler has filtered out tiles >= num_non_exiting_tiles) + tile_info = cute.make_rmem_tensor((4,), cutlass.Int32) + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + 
cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + + while is_valid_tile: + # Peek (try_wait) AB buffer full for k_tile = 0 + a_consumer_state.reset_count() + b_consumer_state.reset_count() + peek_a_full_status = cutlass.Boolean(1) + peek_b_full_status = cutlass.Boolean(1) + if a_consumer_state.count < k_tile_cnt and is_leader_cta: + peek_a_full_status = a_pipeline.consumer_try_wait(a_consumer_state) + if b_consumer_state.count < k_tile_cnt and is_leader_cta: + peek_b_full_status = b_pipeline.consumer_try_wait(b_consumer_state) + + mma_tile_coord_mnl = ( + tile_info[0] // cute.size(tiled_mma.thr_id.shape), + tile_info[1], + tile_info[2], + ) + + tCtAcc = tCtAcc_base[(None, None, None, acc_producer_state.index)] + + # Apply TMEM pointer offset hack when cta_tile_shape_n=192 or + # cta_tile_shape_n=64 # {$nv-internal-release} + + tCtSFB_mma = tCtSFB + if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 192): + # If this is an ODD tile, shift the TMEM start address for + # cta_tile_shape_n=192 case by two words + # (ignores first 64 columns of SFB) + offset = ( + cutlass.Int32(2) if mma_tile_coord_mnl[1] % 2 == 1 else cutlass.Int32(0) + ) + shifted_ptr = cute.recast_ptr( + acc_tmem_ptr + + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base) + + tcgen05.find_tmem_tensor_col_offset(tCtSFA) + + offset, + dtype=self.sf_dtype, + ) + tCtSFB_mma = cute.make_tensor(shifted_ptr, tCtSFB_layout) + elif cutlass.const_expr(self.cta_tile_shape_mnk[1] == 64): + # Move in increments of 64 columns of SFB + offset = cutlass.Int32((mma_tile_coord_mnl[1] % 2) * 2) + shifted_ptr = cute.recast_ptr( + acc_tmem_ptr + + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base) + + tcgen05.find_tmem_tensor_col_offset(tCtSFA) + + offset, + dtype=self.sf_dtype, + ) + tCtSFB_mma = cute.make_tensor(shifted_ptr, tCtSFB_layout) + # + # Wait for accumulator buffer empty + # + if 
is_leader_cta: + acc_pipeline.producer_acquire(acc_producer_state) + # + # Mma mainloop + # + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + for k_tile in cutlass.range(k_tile_cnt): + # Set tensor memory buffer for current tile + # (MMA, MMA_M, MMA_N) + + if is_leader_cta: + # Conditionally wait for AB buffer full + a_pipeline.consumer_wait(a_consumer_state, peek_a_full_status) + b_pipeline.consumer_wait(b_consumer_state, peek_b_full_status) + + # Copy SFA/SFB from smem to tmem + s2t_stage_coord = ( + None, + None, + None, + None, + b_consumer_state.index, + ) + tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord] + tCsSFB_compact_s2t_staged = tCsSFB_compact_s2t[s2t_stage_coord] + cute.copy( + tiled_copy_s2t_sfa, + tCsSFA_compact_s2t_staged, + tCtSFA_compact_s2t, + ) + cute.copy( + tiled_copy_s2t_sfb, + tCsSFB_compact_s2t_staged, + tCtSFB_compact_s2t, + ) + + # tCtAcc += tCrA * tCrSFA * tCrB * tCrSFB + num_kblocks = cute.size(tCrA, mode=[2]) + + for kblock_idx in cutlass.range(num_kblocks, unroll_full=True): + kblock_coord = ( + None, + None, + kblock_idx, + b_consumer_state.index, + ) + + # Set SFA/SFB tensor to tiled_mma + sf_kblock_coord = (None, None, kblock_idx) + tiled_mma.set( + tcgen05.Field.SFA, + tCtSFA[sf_kblock_coord].iterator, + ) + tiled_mma.set( + tcgen05.Field.SFB, + tCtSFB_mma[sf_kblock_coord].iterator, + ) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kblock_coord], + tCrB[kblock_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kblock + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + a_pipeline.consumer_release(a_consumer_state) + b_pipeline.consumer_release(b_consumer_state) + + # Peek (try_wait) AB buffer full for k_tile = k_tile + 1 + a_consumer_state.advance() + b_consumer_state.advance() + peek_a_full_status = cutlass.Boolean(1) + if a_consumer_state.count < k_tile_cnt: + if is_leader_cta: + peek_a_full_status = 
a_pipeline.consumer_try_wait(a_consumer_state) + + peek_b_full_status = cutlass.Boolean(1) + if b_consumer_state.count < k_tile_cnt: + if is_leader_cta: + peek_b_full_status = b_pipeline.consumer_try_wait(b_consumer_state) + + # + # Async arrive accumulator buffer full(each kblock) + # + if is_leader_cta: + acc_pipeline.producer_commit(acc_producer_state) + + # Peek (try_wait) Acc buffer empty for k_tile = k_tile + 1 + acc_producer_state.advance() + + # + # Advance to next tile + # + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + # + # Wait for accumulator buffer empty + # + acc_pipeline.producer_tail(acc_producer_state) + + # + # Specialized epilogue warps + # + if warp_idx <= self.epilog_warp_id[-1]: + # + # Alloc tensor memory buffer + # + tmem.allocate(self.num_tmem_alloc_cols) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + tmem.wait_for_alloc() + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = tmem.retrieve_ptr(self.acc_dtype) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Partition for epilogue + # + epi_tidx = tidx % 128 + ( + tiled_copy_t2r, + tTR_tAcc_base, + tTR_rAcc_up, + tTR_rAcc_gate, + ) = self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + + tTR_rC = None + tiled_copy_r2s = None + tRS_rC = None + tRS_sC = None + bSG_sC = None + bSG_gC_partitioned = None + tTR_rC = cute.make_rmem_tensor(tTR_rAcc_up.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, 
epi_tidx, sC + ) + ( + tma_atom_c, + bSG_sC, + bSG_gC_partitioned, + ) = self.epilog_gmem_copy_and_partition(epi_tidx, tma_atom_c, tCgC, epi_tile, sC) + + if cutlass.const_expr(self.generate_sfc): + norm_const = norm_const_tensor[0] + # (EPI_TILE_M, EPI_TILE_N, RestM, RestN, RestL) + gSFC_mnl = cute.local_tile(mSFC_mnl, epi_tile, (None, None, None)) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, RestM, RestN, RestL) + tCgSFC_mnl = thr_copy_t2r.partition_D(gSFC_mnl) + tCgSFC_mnl = cute.filter_zeros(tCgSFC_mnl) + # (T2R, T2R_M, T2R_N) + tCrSFC = cute.make_rmem_tensor( + tCgSFC_mnl[(None, None, None, 0, 0, 0)].layout, self.sf_dtype + ) + tCrSFC_pvscale = cute.make_rmem_tensor_like(tCrSFC, cutlass.Float32) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + acc_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_acc_stage + ) + + c_pipeline = None + # Threads/warps participating in tma store pipeline + c_producer_group = pipeline.CooperativeGroup( + pipeline.Agent.Thread, + 32 * len(self.epilog_warp_id), + ) + c_pipeline = pipeline.PipelineTmaStore.create( + num_stages=self.num_c_stage, + producer_group=c_producer_group, + ) + + tile_info_consumer_state = pipeline.make_pipeline_state( + pipeline.PipelineUserType.Consumer, self.num_tile_stage + ) + + # Get the first tile info + tile_info = cute.make_rmem_tensor((4,), cutlass.Int32) + + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + 
tile_info_consumer_state.advance() + + num_prev_subtiles = cutlass.Int32(0) + while is_valid_tile: + mma_tile_coord_mnl = ( + tile_info[0] // cute.size(tiled_mma.thr_id.shape), + tile_info[1], + tile_info[2], + ) + # + # Get alpha for current group + # + + expert_idx = mma_tile_coord_mnl[2] + alpha_val = alpha[expert_idx] + + # + # Slice to per mma tile index + # + bSG_gC = None + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + mma_tile_coord_mnl[0], + mma_tile_coord_mnl[1], + 0, + ) + ] + + # Set tensor memory buffer for current tile + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[(None, None, None, None, None, acc_consumer_state.index)] + + if cutlass.const_expr(self.generate_sfc): + # (T2R, T2R_M, T2R_N, RestM, RestN) + tCgSFC_mn = tCgSFC_mnl[ + ( + None, + None, + None, + None, + None, + 0, + ) + ] + + # + # Wait for accumulator buffer full + # + acc_pipeline.consumer_wait(acc_consumer_state) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + + # + # Process accumulator subtiles with SwiGLU fusion and store to global memory + # Each iteration processes a pair of subtiles (up, gate) and computes + # up * silu(gate) + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + + for subtile_idx in cutlass.range(0, subtile_cnt, 2): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn_up = tTR_tAcc[(None, None, None, subtile_idx)] + tTR_tAcc_mn_gate = tTR_tAcc[(None, None, None, subtile_idx + 1)] + + cute.copy(tiled_copy_t2r, tTR_tAcc_mn_up, tTR_rAcc_up) + cute.copy(tiled_copy_t2r, tTR_tAcc_mn_gate, tTR_rAcc_gate) + + acc_vec_up = tTR_rAcc_up.load() + acc_vec_gate = tTR_rAcc_gate.load() + + # + # SwiGLU activation: output = up * silu(gate) + # where silu(x) = x * sigmoid(x) + # up and gate are extracted from interleaved accumulator 
subtiles + # + tCompute = cute.make_rmem_tensor(acc_vec_gate.shape, self.acc_dtype) + if cutlass.const_expr(self.vectorized_f32): + # SwiGLU Packed Version: uses f32x2 packed operations for better performance + # Computes: output = (alpha * up) * silu(alpha * gate) + # where silu(x) = x * sigmoid(x) = x / (1 + exp(-x)) + LOG2_E = cutlass.Float32(1.4426950408889634) + for i in cutlass.range_constexpr(0, cute.size(tTR_rAcc_up), 2): + acc_vec_up_alpha = cute.arch.mul_packed_f32x2( + (acc_vec_up[i], acc_vec_up[i + 1]), + (cutlass.Float32(alpha_val), cutlass.Float32(alpha_val)), + ) + acc_vec_gate_alpha = cute.arch.mul_packed_f32x2( + (acc_vec_gate[i], acc_vec_gate[i + 1]), + (cutlass.Float32(alpha_val), cutlass.Float32(alpha_val)), + ) + tCompute_log2e = cute.arch.mul_packed_f32x2( + (acc_vec_gate_alpha[0], acc_vec_gate_alpha[1]), (-LOG2_E, -LOG2_E) + ) + ( + tCompute[i], + tCompute[i + 1], + ) = cute.arch.add_packed_f32x2( + ( + cute.math.exp2(tCompute_log2e[0], fastmath=True), + cute.math.exp2(tCompute_log2e[1], fastmath=True), + ), + (1.0, 1.0), + ) + tCompute[i] = cute.arch.rcp_approx(tCompute[i]) + tCompute[i + 1] = cute.arch.rcp_approx(tCompute[i + 1]) + ( + tCompute[i], + tCompute[i + 1], + ) = cute.arch.mul_packed_f32x2( + (tCompute[i], tCompute[i + 1]), + (acc_vec_gate_alpha[0], acc_vec_gate_alpha[1]), + ) + ( + tCompute[i], + tCompute[i + 1], + ) = cute.arch.mul_packed_f32x2( + (tCompute[i], tCompute[i + 1]), + (acc_vec_up_alpha[0], acc_vec_up_alpha[1]), + ) + else: + # SwiGLU Unpacked Version: scalar operations + # Computes: output = (alpha * up) * silu(alpha * gate) + for i in cutlass.range_constexpr(cute.size(tTR_rAcc_up)): + acc_vec_up_alpha = acc_vec_up[i] * cutlass.Float32(alpha_val) + acc_vec_gate_alpha = acc_vec_gate[i] * cutlass.Float32(alpha_val) + tCompute[i] = acc_vec_up_alpha * silu_f32( + acc_vec_gate_alpha, fastmath=True + ) + + if cutlass.const_expr(self.generate_sfc): + # + # Quantization path for Float4E2M1FN output: + # 1. 
Compute per-vector absolute max from SwiGLU result + # 2. Generate scale factor C (SFC) based on max values + # 3. Store SFC to global memory + # 4. Quantize output by scaling with reciprocal of SFC + # + # Assume subtile partitioned always happens on n dimension + sfc_subtile_idx_mn = ( + tile_info[0] * self.epi_tile_cnt[0], + tile_info[1] * self.epi_tile_cnt[1] + subtile_idx // 2, + ) + tCgSFC = tCgSFC_mn[ + ( + None, + None, + None, + *sfc_subtile_idx_mn, + ) + ] + + # + # Get absolute max across a vector and Compute SFC + # + tTR_rAcc_frg = cute.logical_divide( + tCompute, cute.make_layout(self.sf_vec_size) + ) + acc_frg = tTR_rAcc_frg.load() + acc_frg = epilogue_op(acc_frg) + + # Apply element-wise absolute value using math.absf (supports vectors) + abs_acc_frg_ir = math.absf(acc_frg.ir_value()) + abs_acc_frg = type(acc_frg)(abs_acc_frg_ir, acc_frg.shape, acc_frg.dtype) + + if cutlass.const_expr(self.vectorized_f32): + for vi in cutlass.range_constexpr(abs_acc_frg.shape[1]): + tCrSFC_pvscale[vi] = abs_acc_frg[None, vi].reduce( + cute.ReductionOp.MAX, + cutlass.Float32(0.0), + 0, # Use 0.0 as init for abs values + ) + for vi in cutlass.range_constexpr(0, abs_acc_frg.shape[1], 2): + tCrSFC_pvscale[vi], tCrSFC_pvscale[vi + 1] = ( + cute.arch.mul_packed_f32x2( + (tCrSFC_pvscale[vi], tCrSFC_pvscale[vi + 1]), + ( + self.get_dtype_rcp_limits(self.c_dtype), + self.get_dtype_rcp_limits(self.c_dtype), + ), + ) + ) + tCrSFC_pvscale[vi], tCrSFC_pvscale[vi + 1] = ( + cute.arch.mul_packed_f32x2( + (tCrSFC_pvscale[vi], tCrSFC_pvscale[vi + 1]), + (norm_const, norm_const), + ) + ) + else: + for vi in cutlass.range_constexpr(abs_acc_frg.shape[1]): + tCrSFC_pvscale[vi] = ( + abs_acc_frg[None, vi].reduce( + cute.ReductionOp.MAX, + cutlass.Float32(0.0), + 0, # Use 0.0 as init for abs values + ) + * self.get_dtype_rcp_limits(self.c_dtype) + * norm_const + ) + + # TODO: need to add f32x2 -> f8x2 conversion + tCrSFC.store(tCrSFC_pvscale.load().to(self.sf_dtype)) + + # + # Store SFC 
to global memory + # + # TODO: Need to think about predicate on it + # if cute.elem_less(): + cute.autovec_copy(tCrSFC, tCgSFC) + + # + # Compute quantized output values and convert to C type + # + # TODO: need to add f8x2 -> f32x2 conversion + tCrSFC_qpvscale_up = tCrSFC.load().to(cutlass.Float32) + fp32_max = cutlass.Float32(3.40282346638528859812e38) + if cutlass.const_expr(self.vectorized_f32): + for vi in cutlass.range_constexpr(0, cute.size(tCrSFC), 2): + acc_scale = cute.arch.mul_packed_f32x2( + ( + cute.arch.rcp_approx(tCrSFC_qpvscale_up[vi]), + cute.arch.rcp_approx(tCrSFC_qpvscale_up[vi + 1]), + ), + (norm_const, norm_const), + ) + acc_scale_min0 = fmin(acc_scale[0], fp32_max, nan=True) + acc_scale_min1 = fmin(acc_scale[1], fp32_max, nan=True) + + vec0 = tTR_rAcc_frg[None, vi] + vec1 = tTR_rAcc_frg[None, vi + 1] + for ei in cutlass.range_constexpr(self.sf_vec_size): + vec0[ei], vec1[ei] = cute.arch.mul_packed_f32x2( + (vec0[ei], vec1[ei]), + (acc_scale_min0, acc_scale_min1), + ) + else: + for vi in cutlass.range_constexpr(cute.size(tCrSFC)): + # TODO:Need to add E8M0 rcp approximation + acc_scale = norm_const * cute.arch.rcp_approx( + tCrSFC_qpvscale_up[vi] + ) + acc_scale = fmin(acc_scale, fp32_max, nan=True) + + vec = tTR_rAcc_frg[None, vi] + for ei in cutlass.range_constexpr(self.sf_vec_size): + vec[ei] = vec[ei] * acc_scale + + acc_vec = tiled_copy_r2s.retile(tCompute).load() + tRS_rC.store(acc_vec.to(self.c_dtype)) + else: + # + # Convert to C type + # + acc_vec = tiled_copy_r2s.retile(tCompute).load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tRS_rC.store(acc_vec) + + # + # Store C to shared memory + # + num_prev_subtiles = num_prev_subtiles + 1 + c_buffer = (num_prev_subtiles + subtile_idx // 2) % self.num_c_stage + + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, None, c_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + 
space=cute.arch.SharedSpace.shared_cta, + ) + self.epilog_sync_barrier.arrive_and_wait() + # + # TMA store C to global memory + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, c_buffer)], + bSG_gC[(None, subtile_idx // 2)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + c_pipeline.producer_commit() + c_pipeline.producer_acquire() + self.epilog_sync_barrier.arrive_and_wait() + + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + acc_pipeline.consumer_release(acc_consumer_state) + acc_consumer_state.advance() + + # + # Advance to next tile + # + tile_info_pipeline.consumer_wait(tile_info_consumer_state) + for idx in cutlass.range(4, unroll_full=True): + tile_info[idx] = sInfo[(idx, tile_info_consumer_state.index)] + is_valid_tile = tile_info[3] == 1 + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + tile_info_pipeline.consumer_release(tile_info_consumer_state) + tile_info_consumer_state.advance() + # + # Dealloc the tensor memory buffer + # + tmem.relinquish_alloc_permit() + self.epilog_sync_barrier.arrive_and_wait() + tmem.free(tmem_ptr) + # + # Wait for C store complete + # + c_pipeline.producer_tail() + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory + (source) and register array (destination). 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc_up, tTR_rAcc_gate) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc_up: The partitioned accumulator tensor for acc up + - tTR_rAcc_gate: The partitioned accumulator tensor for acc gate + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide(gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile) + + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + + # (T2R, T2R_M, T2R_N) + tTR_rAcc_up = cute.make_rmem_tensor( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + # (T2R, T2R_M, T2R_N) + tTR_rAcc_gate = cute.make_rmem_tensor( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, 
def epilog_smem_copy_and_partition(
    self,
    tiled_copy_t2r: cute.TiledCopy,
    tTR_rC: cute.Tensor,
    tidx: cutlass.Int32,
    sC: cute.Tensor,
) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
    """
    Build the register-to-shared-memory tiled copy and partition its operands.

    The store atom is chosen from the C layout/dtype, the accumulator dtype and
    the given tmem-to-register tiled copy; the resulting tiled copy is then used
    to retile the register tensor (source) and to partition ``sC`` (destination)
    for the calling thread.

    :param tiled_copy_t2r: The tiled copy operation for tmem to register copy (t2r)
    :type tiled_copy_t2r: cute.TiledCopy
    :param tTR_rC: The partitioned register tensor to be retiled as the copy source
    :type tTR_rC: cute.Tensor
    :param tidx: The thread index in epilogue warp groups
    :type tidx: cutlass.Int32
    :param sC: The shared memory tensor to be partitioned as the copy destination
    :type sC: cute.Tensor

    :return: A tuple ``(tiled_copy_r2s, tRS_rC, tRS_sC)`` holding the register to
        smem tiled copy, the retiled register source and the partitioned smem
        destination
    :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
    """
    r2s_atom = sm100_utils.get_smem_store_op(
        self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r
    )
    tiled_copy_r2s = cute.make_tiled_copy_D(r2s_atom, tiled_copy_t2r)

    # (R2S, R2S_M, R2S_N)
    tRS_rC = tiled_copy_r2s.retile(tTR_rC)

    # (R2S, R2S_M, R2S_N, PIPE_D)
    tRS_sC = tiled_copy_r2s.get_slice(tidx).partition_D(sC)

    return tiled_copy_r2s, tRS_rC, tRS_sC
@staticmethod
def _compute_stages(
    tiled_mma: cute.TiledMma,
    mma_tiler_mnk: Tuple[int, int, int],
    a_dtype: Type[cutlass.Numeric],
    b_dtype: Type[cutlass.Numeric],
    epi_tile: cute.Tile,
    c_dtype: Type[cutlass.Numeric],
    c_layout: utils.LayoutEnum,
    sf_dtype: Type[cutlass.Numeric],
    sf_vec_size: int,
    num_smem_capacity: int,
    occupancy: int,
) -> Tuple[int, int, int, int]:
    """Computes the number of pipeline stages based on a shared-memory budget heuristic.

    :param tiled_mma: The tiled MMA object defining the core computation.
    :type tiled_mma: cute.TiledMma
    :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
    :type mma_tiler_mnk: tuple[int, int, int]
    :param a_dtype: Data type of operand A.
    :type a_dtype: type[cutlass.Numeric]
    :param b_dtype: Data type of operand B.
    :type b_dtype: type[cutlass.Numeric]
    :param epi_tile: The epilogue tile shape.
    :type epi_tile: cute.Tile
    :param c_dtype: Data type of operand C (output).
    :type c_dtype: type[cutlass.Numeric]
    :param c_layout: Layout of operand C.
    :type c_layout: utils.LayoutEnum
    :param sf_dtype: Data type of scale factor.
    :type sf_dtype: type[cutlass.Numeric]
    :param sf_vec_size: Vector size of scale factor.
    :type sf_vec_size: int
    :param num_smem_capacity: Total available shared memory capacity in bytes.
    :type num_smem_capacity: int
    :param occupancy: Target number of CTAs per SM (occupancy).
    :type occupancy: int

    :return: A tuple containing the computed number of stages for:
             (ACC stages, A/B operand stages, C stages, tile-info stages)
    :rtype: tuple[int, int, int, int]
    """
    # Default ACC stages: a single stage when the MMA tile N is 256, otherwise two
    num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2

    # Default C stages (may be increased below if shared memory is left over)
    num_c_stage = 2

    # Default Tile info stages
    num_tile_stage = 2

    # Calculate smem layout and size for one stage of A, B, SFA, SFB and C
    a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
        tiled_mma,
        mma_tiler_mnk,
        a_dtype,
        1,  # a tmp 1 stage is provided
    )
    b_smem_layout_staged_one = sm100_utils.make_smem_layout_b(
        tiled_mma,
        mma_tiler_mnk,
        b_dtype,
        1,  # a tmp 1 stage is provided
    )

    sfa_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfa(
        tiled_mma,
        mma_tiler_mnk,
        sf_vec_size,
        1,  # a tmp 1 stage is provided
    )

    sfb_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfb(
        tiled_mma,
        mma_tiler_mnk,
        sf_vec_size,
        1,  # a tmp 1 stage is provided
    )

    c_smem_layout_staged_one = sm100_utils.make_smem_layout_epi(
        c_dtype,
        c_layout,
        epi_tile,
        1,
    )

    ab_bytes_per_stage = (
        cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
        + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one)
        + cute.size_in_bytes(sf_dtype, sfa_smem_layout_staged_one)
        + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
    )
    # Bytes reserved up front (1024B alignment) before A/B and C stages are budgeted
    mbar_helpers_bytes = 1024
    c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
    c_bytes = c_bytes_per_stage * num_c_stage

    # Calculate A/B stages:
    # Start with total smem per CTA (capacity / occupancy)
    # Subtract reserved bytes and initial C stages bytes
    # Divide remaining by bytes needed per A/B stage
    num_ab_stage = (
        num_smem_capacity // occupancy - (mbar_helpers_bytes + c_bytes)
    ) // ab_bytes_per_stage

    # Refine epilogue stages:
    # Calculate remaining smem after allocating for A/B stages and reserved bytes
    # Add remaining unused smem to epilogue
    num_c_stage += (
        num_smem_capacity
        - occupancy * ab_bytes_per_stage * num_ab_stage
        - occupancy * (mbar_helpers_bytes + c_bytes)
    ) // (occupancy * c_bytes_per_stage)
    return num_acc_stage, num_ab_stage, num_c_stage, num_tile_stage
@staticmethod
def _get_tma_atom_kind(
    atom_sm_cnt: cutlass.Int32, mcast: cutlass.Boolean
) -> Union[cpasync.CopyBulkTensorTileG2SMulticastOp, cpasync.CopyBulkTensorTileG2SOp]:
    """
    Pick the TMA global-to-shared copy op for the given SM count and multicast flag.

    :param atom_sm_cnt: The number of SMs (1 or 2)
    :type atom_sm_cnt: cutlass.Int32
    :param mcast: Whether the multicast variant of the copy op is requested
    :type mcast: cutlass.Boolean

    :return: A multicast or plain G2S copy op built with the matching CTA group
    :rtype: cpasync.CopyBulkTensorTileG2SMulticastOp or cpasync.CopyBulkTensorTileG2SOp

    :raise ValueError: If atom_sm_cnt is neither 1 nor 2
    """
    # The SM count only decides the CTA group; anything else is rejected.
    if atom_sm_cnt == 2:
        cta_group = tcgen05.CtaGroup.TWO
    elif atom_sm_cnt == 1:
        cta_group = tcgen05.CtaGroup.ONE
    else:
        raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}")

    # The multicast flag only decides which op class wraps that group.
    if mcast:
        return cpasync.CopyBulkTensorTileG2SMulticastOp(cta_group)
    return cpasync.CopyBulkTensorTileG2SOp(cta_group)
@staticmethod
def is_valid_dtypes_and_scale_factor_vec_size(
    ab_dtype: Type[cutlass.Numeric],
    sf_dtype: Type[cutlass.Numeric],
    sf_vec_size: int,
    acc_dtype: Type[cutlass.Numeric],
    c_dtype: Type[cutlass.Numeric],
) -> bool:
    """
    Report whether the operand, scale-factor, accumulator and output dtypes,
    together with the scale-factor vector size, form a supported combination.

    :param ab_dtype: The data type of the A and B operands
    :type ab_dtype: Type[cutlass.Numeric]
    :param sf_dtype: The data type of the scale factor
    :type sf_dtype: Type[cutlass.Numeric]
    :param sf_vec_size: The vector size of the scale factor
    :type sf_vec_size: int
    :param acc_dtype: The data type of the accumulator
    :type acc_dtype: Type[cutlass.Numeric]
    :param c_dtype: The data type of the output tensor
    :type c_dtype: Type[cutlass.Numeric]

    :return: True if no rule is violated, False otherwise
    :rtype: bool
    """
    # Every entry is True when the corresponding rule is violated. All rules are
    # evaluated up front; the combination is valid only if none of them fired.
    violations = (
        # A/B operand dtype must be one of the supported narrow float types
        ab_dtype
        not in {
            cutlass.Float4E2M1FN,
            cutlass.Float8E5M2,
            cutlass.Float8E4M3FN,
        },
        # Scale-factor vector size
        sf_vec_size not in {16, 32},
        # Scale-factor dtype
        sf_dtype not in {cutlass.Float8E8M0FNU, cutlass.Float8E4M3FN},
        # Unsupported sf_dtype / sf_vec_size pairing
        sf_dtype == cutlass.Float8E4M3FN and sf_vec_size == 32,
        # 8-bit A/B operands cannot use a vector size of 16
        ab_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} and sf_vec_size == 16,
        # Accumulator dtype
        acc_dtype not in {cutlass.Float32},
        # Output dtype
        c_dtype
        not in {
            cutlass.Float32,
            cutlass.Float16,
            cutlass.BFloat16,
            cutlass.Float8E5M2,
            cutlass.Float8E4M3FN,
            cutlass.Float4E2M1FN,
        },
    )
    return not any(violations)
[128, 256]) + ): + is_valid = False + # Skip invalid mma tile n + # Needs to have even iterations with Epi Tile N 64 for swiGeLU fusion + if mma_tiler_mn[1] not in (128, 256): + is_valid = False + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + is_valid = False + # Skip invalid cluster shape + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + # Special cluster shape check for scale factor multicasts. + # Due to limited size of scale factors, we can't multicast among more than 4 CTAs. + or cluster_shape_mn[0] > 4 + or cluster_shape_mn[1] > 4 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + is_valid = False + cluster_tiler_m = (cluster_shape_mn[0] // (2 if use_2cta_instrs else 1)) * mma_tiler_mn[0] + + # Skip invalid cluster tiler shape since contiguous layout can't handle oob access + # The contiguous layout means the aligned data is stored in a contiguous manner. + # It can't handle runtime oob when alignment is not align with the tile_M, + # since the problem shape of TMA store can't be changed at runtime. 
+ if cluster_tiler_m not in [64, 128, 256]: + is_valid = False + + # Check if m_aligned is a multiple of cluster_tiler_m + # This ensures that each group's M dimension (which is a multiple of m_aligned) + # won't be split across tiles, preventing a single tile from loading data + # from multiple groups (which would access wrong B matrix data) + if m_aligned % mma_tiler_mn[0] != 0: + is_valid = False + + return is_valid + + @staticmethod + def is_valid_tensor_alignment( + m: int, + n: int, + k: int, + l: int, # noqa: E741 + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the tensor alignment is valid + + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the problem shape is valid, False otherwise + :rtype: bool + """ + is_valid = True + + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + is_valid = 
False + return is_valid + + @staticmethod + def can_implement( + ab_dtype: Type[cutlass.Numeric], + sf_dtype: Type[cutlass.Numeric], + sf_vec_size: int, + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + m: int, + n: int, + k: int, + l: int, # noqa: E741 + a_major: str, + b_major: str, + c_major: str, + m_aligned: int, + ) -> bool: + """ + Check if the gemm can be implemented + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param sf_dtype: The data type of the scale factor + :type sf_dtype: Type[cutlass.Numeric] + :param sf_vec_size: The vector size of the scale factor + :type sf_vec_size: int + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + :param m_aligned: The alignment requirement for group M dimension (default: 128) + :type m_aligned: int + + :return: True if the gemm can be implemented, False otherwise + :rtype: bool + """ + can_implement = True + # Skip unsupported types + if not 
@cute.jit
def wrapper(
    self,
    a_ptr: cute.Pointer,
    b_ptr: cute.Pointer,
    a_sf_ptr: cute.Pointer,
    b_sf_ptr: cute.Pointer,
    c_ptr: cute.Pointer,
    c_sf_ptr: cute.Pointer,
    alpha_ptr: cute.Pointer,
    tile_idx_to_group_idx_ptr: cute.Pointer,
    tile_idx_to_mn_limit_ptr: cute.Pointer,
    token_id_mapping_ptr: cute.Pointer,
    num_non_exiting_tiles_ptr: cute.Pointer,
    global_sf_ptr: cute.Pointer,
    orig_m: int,
    m: int,
    n: int,
    k: int,
    l: int,  # noqa: E741
    tile_size: cutlass.Constexpr,
    scaling_vector_size: cutlass.Constexpr,
    max_active_clusters: cutlass.Constexpr,
    stream: cuda.CUstream,
    epilogue_op: cutlass.Constexpr = lambda x: x,
):
    """
    Wrap raw pointers into CuTe tensors and invoke the kernel via ``self(...)``.

    :param a_ptr: Pointer to A; viewed as (orig_m, k, 1)
    :param b_ptr: Pointer to B; viewed as (n, k, l)
    :param a_sf_ptr: Pointer to A scale factors; viewed as (orig_m, k // scaling_vector_size, 1)
    :param b_sf_ptr: Pointer to B scale factors; viewed with a 6-mode blocked layout
    :param c_ptr: Pointer to C; viewed as (m, n // 2, 1)
    :param c_sf_ptr: Pointer to C scale factors; viewed with a 6-mode blocked layout
    :param alpha_ptr: Pointer to ``l`` alpha values
    :param tile_idx_to_group_idx_ptr: Pointer to ``m // tile_size`` entries
    :param tile_idx_to_mn_limit_ptr: Pointer to ``m // tile_size`` entries
    :param token_id_mapping_ptr: Pointer to ``m`` entries
    :param num_non_exiting_tiles_ptr: Pointer to a single element
    :param global_sf_ptr: Pointer to a single element
    :param orig_m: Row count used for A and its scale factors
        (presumably the un-gathered token count -- TODO confirm against caller)
    :param m: Row count used for C, the tile maps and the token-id mapping
    :param n: Row count of B; C has ``n // 2`` columns
    :param k: Shared K extent of A and B
    :param l: Last-mode extent of B, the scale-factor layouts and alpha
    :param tile_size: Rows per tile; determines the number of tile-map entries
    :param scaling_vector_size: Number of elements covered by one scale factor
    :param max_active_clusters: Forwarded unchanged to the kernel call
    :param stream: Forwarded unchanged to the kernel call
    :param epilogue_op: Forwarded unchanged to the kernel call (identity by default)
    :return: Whatever ``self(...)`` returns
    """
    # Derived extents: scale factors along K, output width, and tile count.
    scale_k = k // scaling_vector_size
    interm_size = n // 2
    num_tiles = m // tile_size
    # order=(1, 0, 2): presumably mode 1 (K) is the fastest-varying mode -- TODO confirm
    a = cute.make_tensor(
        a_ptr, layout=cute.make_ordered_layout((orig_m, k, 1), order=(1, 0, 2))
    )
    b = cute.make_tensor(b_ptr, layout=cute.make_ordered_layout((n, k, l), order=(1, 0, 2)))
    a_sf = cute.make_tensor(
        a_sf_ptr, layout=cute.make_ordered_layout((orig_m, scale_k, 1), order=(1, 0, 2))
    )
    # B scale factors use a blocked 6-mode shape: 32 x 4 x (n / 128) x 4 x (scale_k / 4) x l.
    b_sf = cute.make_tensor(
        b_sf_ptr,
        layout=cute.make_ordered_layout(
            (32, 4, n // 128, 4, scale_k // 4, l), order=(2, 1, 4, 0, 3, 5)
        ),
    )
    c = cute.make_tensor(
        c_ptr, layout=cute.make_ordered_layout((m, interm_size, 1), order=(1, 0, 2))
    )
    # C scale factors mirror the B scale-factor blocking, over (m, interm_size).
    # NOTE(review): the last mode here is ``l`` while ``c`` itself uses 1 -- confirm intended.
    c_sf = cute.make_tensor(
        c_sf_ptr,
        layout=cute.make_ordered_layout(
            (32, 4, m // 128, 4, interm_size // (scaling_vector_size * 4), l),
            order=(2, 1, 4, 0, 3, 5),
        ),
    )
    alpha = cute.make_tensor(alpha_ptr, layout=cute.make_layout((l,)))

    # Per-tile and per-row metadata are plain 1-D tensors.
    tile_idx_to_group_idx = cute.make_tensor(
        tile_idx_to_group_idx_ptr, layout=cute.make_layout((num_tiles,))
    )
    tile_idx_to_mn_limit = cute.make_tensor(
        tile_idx_to_mn_limit_ptr, layout=cute.make_layout((num_tiles,))
    )
    token_id_mapping = cute.make_tensor(token_id_mapping_ptr, layout=cute.make_layout((m,)))
    num_non_exiting_tiles = cute.make_tensor(
        num_non_exiting_tiles_ptr, layout=cute.make_layout((1,))
    )
    global_sf = cute.make_tensor(global_sf_ptr, layout=cute.make_layout((1,)))

    # Note the argument order differs from this wrapper's parameter order
    # (c precedes the scale factors; alpha comes last among the tensors).
    return self(
        a,
        b,
        c,
        a_sf,
        b_sf,
        c_sf,
        global_sf,
        tile_idx_to_group_idx,
        tile_idx_to_mn_limit,
        token_id_mapping,
        num_non_exiting_tiles,
        alpha,
        max_active_clusters=max_active_clusters,
        stream=stream,
        epilogue_op=epilogue_op,
    )
@dataclass(frozen=True)
class PipelineCpAsyncUmma(PipelineAsync):
    """
    PipelineCpAsyncUmma is used for LDGSTS (CpAsync) producers and UMMA consumers.

    This pipeline is specifically designed for scenarios where:
    - Producers use LDGSTS instructions (cp.async) to load data from global to shared memory
    - Consumers are UMMA warps that perform MMA operations using the loaded data

    Key differences from PipelineAsyncUmma:
    - Suitable for gather/permutation operations during load
    - Used in this kernel for A and SFA matrices with token-based gather addressing

    Instances are built through :meth:`create`; the only field added on top of
    ``PipelineAsync`` is ``cta_group``, which ``consumer_release`` passes to the
    empty-barrier arrive.
    """

    # CTA group (ONE or TWO) handed to the empty-barrier arrive in consumer_release.
    cta_group: cute.nvgpu.tcgen05.CtaGroup

    @staticmethod
    def _compute_leading_cta_rank(cta_v_size):
        """
        Computes the leading CTA rank.

        The current CTA's rank in the cluster is rounded down to the nearest
        multiple of ``cta_v_size``.
        """
        cta_rank_in_cluster = cute.arch.make_warp_uniform(
            cute.arch.block_idx_in_cluster())
        return cta_rank_in_cluster // cta_v_size * cta_v_size

    @staticmethod
    def _compute_is_leader_cta(cta_layout_vmnk: cute.Layout):
        """
        Computes leader threadblocks for 2CTA kernels. For 1CTA, all threadblocks are leaders.

        A CTA is a leader when its block index x, taken modulo the size of mode 0
        of ``cta_layout_vmnk``, is zero.
        """
        bidx, bidy, _ = cute.arch.block_idx()
        mma_coord_vmnk = (
            bidx % cute.size(cta_layout_vmnk, mode=[0]),
            bidx // cute.size(cta_layout_vmnk, mode=[0]),
            bidy,
            None,
        )
        return mma_coord_vmnk[0] == 0

    @staticmethod
    def _compute_peer_cta_mask(cta_layout_vmnk: cute.Layout):
        """
        Computes a mask for signaling arrivals to multicasting threadblocks.

        The result is the OR of two multicast masks (``mcast_mode=0``): one for
        this CTA's own coordinate and one for the coordinate whose mode-0
        component is flipped (``^ 1``), i.e. its peer.
        """
        cta_rank_in_cluster = cute.arch.make_warp_uniform(
            cute.arch.block_idx_in_cluster())
        cta_in_cluster_coord_vmnk = cta_layout_vmnk.get_flat_coord(
            cta_rank_in_cluster)
        mask_self = cute.nvgpu.cpasync.create_tma_multicast_mask(
            cta_layout_vmnk, cta_in_cluster_coord_vmnk, mcast_mode=0)
        # Same coordinate with only the V (mode 0) component toggled.
        block_in_cluster_coord_vmnk_peer = (
            cta_in_cluster_coord_vmnk[0] ^ 1,
            *cta_in_cluster_coord_vmnk[1:],
        )
        mask_peer = cute.nvgpu.cpasync.create_tma_multicast_mask(
            cta_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=0)
        return mask_self | mask_peer

    @staticmethod
    def create(
        *,
        num_stages: int,
        producer_group: CooperativeGroup,
        consumer_group: CooperativeGroup,
        barrier_storage: cute.Pointer = None,
        cta_layout_vmnk: Optional[cute.Layout] = None,
        defer_sync: bool = False,
        enable_cp_async: bool = False,
    ):
        """Creates and initializes a new PipelineCpAsyncUmma instance.

        :param num_stages: Number of buffer stages for this pipeline
        :type num_stages: int
        :param producer_group: CooperativeGroup for the producer agent
        :type producer_group: CooperativeGroup
        :param consumer_group: CooperativeGroup for the consumer agent
        :type consumer_group: CooperativeGroup
        :param barrier_storage: Pointer to the shared memory address for this pipeline's
            mbarriers. Although the default is None, a cute.Pointer must be supplied.
        :type barrier_storage: cute.Pointer
        :param cta_layout_vmnk: Layout of the cluster shape
        :type cta_layout_vmnk: cute.Layout, optional
        :param defer_sync: Whether to skip the synchronization at the end of creation
        :type defer_sync: bool, optional
        :param enable_cp_async: Whether the producer is typed as PipelineOp.AsyncLoad
            (True) or PipelineOp.AsyncThread (False)
        :type enable_cp_async: bool, optional
        :raises ValueError: If barrier_storage is not a cute.Pointer instance
        :return: A new PipelineCpAsyncUmma instance configured with the provided parameters
        :rtype: PipelineCpAsyncUmma
        """
        if not isinstance(barrier_storage, cute.Pointer):
            raise ValueError(
                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
            )

        producer_type = PipelineOp.AsyncLoad if enable_cp_async else PipelineOp.AsyncThread
        consumer_type = PipelineOp.TCGen05Mma

        producer = (producer_type, producer_group)
        consumer = (consumer_type, consumer_group)

        # "Full" barriers occupy the first num_stages slots of barrier_storage;
        # "empty" barriers follow immediately after.
        sync_object_full = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8),
            num_stages,
            producer,
        )
        sync_object_empty = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8) + num_stages, num_stages,
            consumer)

        # Mode 0 (V) of the cluster layout decides between 1CTA and 2CTA operation.
        cta_v_size = cute.size(cta_layout_vmnk,
                               mode=[0]) if cta_layout_vmnk is not None else 1
        cta_group = (cute.nvgpu.tcgen05.CtaGroup.ONE if cta_layout_vmnk is None
                     or cute.size(cta_layout_vmnk, mode=[0]) == 1 else
                     cute.nvgpu.tcgen05.CtaGroup.TWO)
        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1:
            # No mcast mask if we're not using 2CTA tcgen05 MMA
            producer_mask = None
            consumer_mask = None
        else:
            # If we're using 2CTA UMMAs, producer will arrive the mbar on leading CTA
            # We need to get the target cta_rank
            producer_mask = PipelineCpAsyncUmma._compute_leading_cta_rank(
                cta_v_size)
            # consumer needs to get the mask to signal
            consumer_mask = PipelineCpAsyncUmma._compute_peer_cta_mask(
                cta_layout_vmnk)

        if not defer_sync:
            # The sync scope follows the total cluster size (all modes), not just
            # mode 0: block-wide for a single CTA, cluster-wide otherwise.
            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
                agent_sync(Agent.ThreadBlock)
            else:
                agent_sync(Agent.ThreadBlockCluster, is_relaxed=True)

        return PipelineCpAsyncUmma(
            sync_object_full,
            sync_object_empty,
            num_stages,
            producer_mask,
            consumer_mask,
            cta_group,
        )

    def consumer_release(self, state: PipelineState):
        """
        UMMA consumer release buffer empty, cta_group needs to be provided.

        Arrives on the empty barrier at ``state.index`` using this pipeline's
        ``consumer_mask`` and ``cta_group``.

        :param state: Pipeline state whose ``index`` selects the stage to release
        :type state: PipelineState
        """
        self.sync_object_empty.arrive(state.index, self.consumer_mask,
                                      self.cta_group)
+ """ + out_shape, scale_shape = fp4_utils.get_fp4_shape(input_shapes[0], + sf_vec_size=16, + is_swizzled_layout=False) + return scale_shape * 2 + + _enable_piecewise_cuda_graph = True diff --git a/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py b/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py index 4faec5d6f1..f146573ff0 100644 --- a/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py +++ b/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py @@ -1,7 +1,11 @@ import pytest import torch +from utils.util import check_accuracy -from tensorrt_llm._torch.custom_ops.cute_dsl_custom_ops import GroupedGemmInputsHelper +from tensorrt_llm._torch.custom_ops.cute_dsl_custom_ops import ( + GatherGroupedGemmInputsHelper, + GroupedGemmInputsHelper, +) from tensorrt_llm._torch.modules.fused_moe.fused_moe_cute_dsl import cute_dsl_nvfp4_grouped_gemm_ref from tensorrt_llm._torch.modules.fused_moe.quantization import interleave_linear_and_gate from tensorrt_llm._torch.utils import swizzle_sf, unswizzle_sf @@ -707,3 +711,204 @@ def test_nvfp4_grouped_gemm_swiglu_blackwell( c_sf[:num_sf_elements] == c_sf_ref[:num_sf_elements] ).sum().item() / num_sf_elements assert match_ratio > 0.95 + + +@pytest.mark.skipif( + get_sm_version() not in (100, 103), + reason="This test is only supported on SM 100 and SM 103 GPUs", +) +@pytest.mark.parametrize("tile_size", [128, 256]) +@pytest.mark.parametrize("ep_size", [1, 8, 32]) +@pytest.mark.parametrize("top_k", [1, 2, 8]) +@pytest.mark.parametrize("num_tokens", [128, 515, 1024, 8192]) +def test_nvfp4_gather_grouped_gemm_swiglu_blackwell( + num_tokens: int, top_k: int, ep_size: int, tile_size: int +): + """Test gather-based grouped GEMM with SwiGLU fusion. + + This test validates the gather kernel which: + 1. Uses LDGSTS for A/SFA loading with permuted_idx_to_expanded_idx + 2. Performs GEMM with interleaved weights + 3. Applies SwiGLU activation fusion + 4. 
Quantizes output to FP4 with scale factor generation + """ + sf_vec_size = 16 + hidden_size = 4096 + interm_size = 8192 + num_experts = 256 + num_local_experts = num_experts // ep_size + + # Generate routing information + routing_logits = torch.randn(num_tokens, num_experts, device="cuda") + _, token_selected_experts = routing_logits.topk(top_k, dim=-1) + token_selected_experts = token_selected_experts.to(torch.int32) + num_tokens_per_expert = torch.bincount(token_selected_experts.flatten(), minlength=num_experts) + num_tokens_per_expert = num_tokens_per_expert[:num_local_experts] + # Ensure at least one valid token + if num_tokens_per_expert.sum().item() == 0: + num_tokens_per_expert[0] = 1 + num_tiles_per_expert = (num_tokens_per_expert + tile_size - 1) // tile_size + num_tokens_per_expert = num_tokens_per_expert.cpu() + num_tiles_per_expert = num_tiles_per_expert.cpu() + num_valid_tiles = num_tiles_per_expert.sum().item() + num_valid_permuted_tokens = num_valid_tiles * tile_size + + # Create helper + helper = GatherGroupedGemmInputsHelper(num_experts, top_k, num_local_experts, 0, tile_size) + max_num_tiles = helper.get_max_num_tiles(num_tokens) + max_num_permuted_tokens = helper.get_max_num_permuted_tokens(num_tokens) + assert 0 <= num_valid_tiles <= max_num_tiles + assert 0 <= num_valid_permuted_tokens <= max_num_permuted_tokens + + # Generate tile metadata + num_non_exiting_tiles = torch.tensor([num_valid_tiles], dtype=torch.int32, device="cuda") + tile_idx_to_group_idx = torch.empty(max_num_tiles, dtype=torch.int32) + tile_idx_to_mn_limit = torch.empty(max_num_tiles, dtype=torch.int32) + tile_idx_to_group_idx.fill_(int(-2e9)) + tile_idx_to_mn_limit.fill_(int(-2e9)) + + tile_idx_to_group_idx_list = helper.generate_tile_idx_to_group_idx( + num_tokens_per_expert.tolist() + ) + tile_idx_to_mn_limit_list = helper.generate_tile_idx_to_mn_limit(num_tokens_per_expert.tolist()) + + for idx, (group_idx, mn_limit) in enumerate( + zip(tile_idx_to_group_idx_list, 
tile_idx_to_mn_limit_list) + ): + tile_idx_to_group_idx[idx] = group_idx + tile_idx_to_mn_limit[idx] = mn_limit + + tile_idx_to_group_idx = tile_idx_to_group_idx.cuda() + tile_idx_to_mn_limit = tile_idx_to_mn_limit.cuda() + + # Generate permuted_idx_to_expanded_idx for gather operation + permuted_idx_to_expanded_idx_list = helper.generate_permuted_idx_to_expanded_idx( + num_tokens, num_tokens_per_expert.tolist(), max_num_permuted_tokens + ) + permuted_idx_to_expanded_idx = torch.tensor( + permuted_idx_to_expanded_idx_list, dtype=torch.int32, device="cuda" + ) + assert permuted_idx_to_expanded_idx.size(0) == max_num_permuted_tokens + + # Create input tensors (original size, not permuted) + a = torch.randint(-5, 5, (num_tokens, hidden_size), dtype=torch.int32, device="cuda").to( + torch.bfloat16 + ) + b = torch.randint( + -5, + 5, + (num_local_experts, interm_size * 2, hidden_size), + dtype=torch.int32, + device="cuda", + ).to(torch.bfloat16) + + # Quantize inputs to FP4 + a_global_sf = a.abs().max().float() / (448 * 6) + b_global_sf = b.abs().amax(dim=(1, 2)).float() / (448 * 6) + a, a_sf = torch.ops.trtllm.fp4_quantize(a, 1 / a_global_sf, sf_vec_size, False) + a = a.view(torch.float4_e2m1fn_x2) + a_sf_unswizzled = unswizzle_sf(a_sf, (num_tokens + 127) // 128 * 128, hidden_size)[:num_tokens] + b, b_sf = torch.ops.trtllm.fp4_quantize(b, 1 / b_global_sf, sf_vec_size, False) + b = b.view(torch.float4_e2m1fn_x2) + b_sf = b_sf.view(num_local_experts, interm_size * 2, hidden_size // sf_vec_size) + alpha = a_global_sf * b_global_sf + + # Interleave weights for SwiGLU + b_interleaved = interleave_linear_and_gate(b.view(torch.uint8), group_size=64, dim=1).view( + torch.float4_e2m1fn_x2 + ) + b_sf_unswizzled = unswizzle_sf(b_sf, interm_size * 2, hidden_size).view( + num_local_experts, interm_size * 2, hidden_size // sf_vec_size + ) + b_sf_unswizzled_interleaved = interleave_linear_and_gate(b_sf_unswizzled, group_size=64, dim=1) + b_sf_interleaved = 
swizzle_sf(b_sf_unswizzled_interleaved, interm_size * 2, hidden_size).view( + num_local_experts, interm_size * 2, hidden_size // sf_vec_size + ) + + # Compute reference: manually gather, compute GEMM, apply SwiGLU, then quantize + a_gathered = torch.empty( + max_num_permuted_tokens, hidden_size // 2, dtype=a.dtype, device=a.device + ) + a_sf_gathered = torch.empty( + max_num_permuted_tokens, hidden_size // sf_vec_size, dtype=a_sf.dtype, device=a_sf.device + ) + for i in range(num_valid_permuted_tokens): + expanded_idx = permuted_idx_to_expanded_idx[i].item() + if expanded_idx != helper.pad_val: + token_id = expanded_idx // top_k + a_gathered[i] = a[token_id] + a_sf_gathered[i] = a_sf_unswizzled[token_id] + + # Swizzle a_sf_gathered for reference GEMM + a_sf_gathered_swizzled = swizzle_sf( + a_sf_gathered.view(max_num_permuted_tokens, hidden_size // sf_vec_size), + max_num_permuted_tokens, + hidden_size, + ) + + c_ref = cute_dsl_nvfp4_grouped_gemm_ref( + a_gathered, + b, + a_sf_gathered_swizzled, + b_sf, + alpha, + tile_idx_to_group_idx, + num_non_exiting_tiles, + tile_size=tile_size, + output_dtype=torch.bfloat16, + scaling_vector_size=sf_vec_size, + ) + c_ref = swiglu_ref(c_ref) + global_sf = c_ref[:num_valid_permuted_tokens].abs().max().float() / (448 * 6) + c_ref, c_sf_ref = torch.ops.trtllm.fp4_quantize(c_ref, 1 / global_sf, sf_vec_size, False) + + # Call gather kernel + c, c_sf = torch.ops.trtllm.cute_dsl_nvfp4_gather_grouped_gemm_swiglu_blackwell( + a, + b_interleaved, + a_sf_unswizzled, + b_sf_interleaved, + alpha, + tile_idx_to_group_idx, + tile_idx_to_mn_limit, + permuted_idx_to_expanded_idx, + num_non_exiting_tiles, + torch.tensor([1 / global_sf], dtype=torch.float32, device="cuda"), + num_experts=num_experts, + top_k=top_k, + num_local_experts=num_local_experts, + local_expert_offset=0, + tile_size=tile_size, + scaling_vector_size=sf_vec_size, + ) + + # Verify output (only compare valid tokens, skip padding tokens where permuted_idx_to_expanded_idx == 
-1) + # Create mask for valid tokens + valid_token_mask = torch.zeros(num_valid_permuted_tokens, dtype=torch.bool, device="cuda") + for i in range(num_valid_permuted_tokens): + if permuted_idx_to_expanded_idx[i].item() != helper.pad_val: + valid_token_mask[i] = True + + num_valid_tokens = valid_token_mask.sum().item() + if num_valid_tokens > 0: + # Compare output values only for valid tokens + c_valid = c[:num_valid_permuted_tokens].view(torch.uint8)[valid_token_mask] + c_ref_valid = c_ref[:num_valid_permuted_tokens][valid_token_mask] + check_accuracy(c_valid, c_ref_valid, atol=1e-4, rtol=1e-4, percent=0.95) + + c_sf_unswizzled = unswizzle_sf(c_sf, max_num_permuted_tokens, interm_size, sf_vec_size) + c_sf_ref_unswizzled = unswizzle_sf( + c_sf_ref, max_num_permuted_tokens, interm_size, sf_vec_size + ) + + # Compare scale factors only for valid tokens + c_sf_valid = [] + c_sf_ref_valid = [] + for i in range(num_valid_permuted_tokens): + if permuted_idx_to_expanded_idx[i].item() != helper.pad_val: + c_sf_valid.append(c_sf_unswizzled[i]) + c_sf_ref_valid.append(c_sf_ref_unswizzled[i]) + + c_sf_valid = torch.cat(c_sf_valid) + c_sf_ref_valid = torch.cat(c_sf_ref_valid) + check_accuracy(c_sf_valid, c_sf_ref_valid, atol=1e-4, rtol=1e-4, percent=0.95) From 04a39a4e2bf4acd3eff9e071e76e4c4b11edcf00 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:47:14 +0800 Subject: [PATCH 071/172] [None][chore] enable test_ipc.py (#9865) Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> --- .../integration/test_lists/test-db/l0_a10.yml | 1 + tests/unittest/executor/test_ipc.py | 94 +++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 0d7d4ee601..4f986b3e2d 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -78,6 +78,7 @@ 
l0_a10: - unittest/llmapi/test_additional_model_outputs.py -m "gpu1" # executor - unittest/executor/test_rpc.py + - unittest/executor/test_ipc.py # trtllm-serve CPU-only - unittest/llmapi/apps/test_chat_utils.py - unittest/llmapi/apps/test_tool_parsers.py diff --git a/tests/unittest/executor/test_ipc.py b/tests/unittest/executor/test_ipc.py index 0467769913..ebe3c57c43 100644 --- a/tests/unittest/executor/test_ipc.py +++ b/tests/unittest/executor/test_ipc.py @@ -538,6 +538,100 @@ class TestIpcAsyncBasics: client.close() server.close() + @pytest.mark.asyncio + async def test_async_router_without_hmac(self): + """Test async ROUTER socket without HMAC encryption.""" + server = ZeroMqQueue( + address=None, + socket_type=zmq.ROUTER, + is_server=True, + is_async=True, + name="async_router_server_no_hmac", + use_hmac_encryption=False, + ) + + client = ZeroMqQueue( + address=server.address, + socket_type=zmq.DEALER, + is_server=False, + is_async=True, + name="async_dealer_client_no_hmac", + use_hmac_encryption=False, + ) + + try: + # Client sends async request + request = {"async_request": "process_no_hmac"} + await client.put_async(request) + + # Server receives with identity + received = await server.get_async() + assert received == request + + # Server replies + response = {"async_response": "completed_no_hmac"} + await server.put_async(response) + + # Client receives + received = await client.get_async() + assert received == response + finally: + client.close() + server.close() + + @pytest.mark.asyncio + async def test_async_router_get_noblock(self): + """Test get_async_noblock on ROUTER socket (handling multipart).""" + server = ZeroMqQueue( + address=None, + socket_type=zmq.ROUTER, + is_server=True, + is_async=True, + name="async_router_noblock_server", + use_hmac_encryption=False, + ) + + client = ZeroMqQueue( + address=server.address, + socket_type=zmq.DEALER, + is_server=False, + is_async=True, + name="async_dealer_noblock_client", + use_hmac_encryption=False, + ) 
+ + try: + # Client sends async request + request = {"noblock_request": "test"} + + # Send with small delay to ensure we test the polling/waiting + async def send_delayed(): + await asyncio.sleep(0.1) + await client.put_async(request) + + send_task = asyncio.create_task(send_delayed()) + + # Server receives using get_async_noblock + # This exercises the ROUTER specific recv_multipart path + received = await server.get_async_noblock(timeout=2.0) + assert received == request + + # Ensure identity was captured so we can reply + assert server._last_identity is not None + + # Server replies + response = {"noblock_response": "done"} + await server.put_async(response) + + # Client receives + received = await client.get_async() + assert received == response + + await send_task + finally: + client.close() + server.close() + class TestIpcPressureTest: """Test performance and load handling.""" From af2849cc7a336273de854b45ef7dc1e6be9f626a Mon Sep 17 00:00:00 2001 From: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> Date: Thu, 11 Dec 2025 18:04:48 +0800 Subject: [PATCH 072/172] [None][doc] Add DeepSeek-V3.2 to the supported models (#9893) Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> --- docs/source/models/supported-models.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index c6b6194b5d..40f3840073 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -8,6 +8,7 @@ The following is a table of supported models for the PyTorch backend: | `BertForSequenceClassification` | BERT-based | `textattack/bert-base-uncased-yelp-polarity` | | `DeciLMForCausalLM` | Nemotron | `nvidia/Llama-3_1-Nemotron-51B-Instruct` | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` | +| `DeepseekV32ForCausalLM` | DeepSeek-V3.2 | `deepseek-ai/DeepSeek-V3.2` | | `Exaone4ForCausalLM` | EXAONE 4.0 | `LGAI-EXAONE/EXAONE-4.0-32B` 
| | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it` | | `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b` | @@ -34,6 +35,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl | Model Architecture/Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Sliding Window Attention | Logits Post Processor | Guided Decoding | | ------------------------------ | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | --- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ------------------------ | --------------------- | --------------- | | `DeepseekV3ForCausalLM` | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes | +| `DeepseekV32ForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | N/A | Yes | Yes | | `Qwen3MoeForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes | | `Qwen3NextForCausalLM` | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested | | `Llama4ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes | From 488d38f88d17936a3c54db81fce1f7e9cb80b99d Mon Sep 17 00:00:00 2001 From: xxi <95731198+xxi-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 00:22:13 +0800 Subject: [PATCH 073/172] [TRTLLM-8959][feat] ConfigurableMoE support CUTLASS (#9772) --- .../communication/nvlink_two_sided.py | 6 +- .../modules/fused_moe/configurable_moe.py | 128 ++++++--- .../_torch/modules/fused_moe/create_moe.py | 2 +- .../modules/fused_moe/fused_moe_cutlass.py | 257 +++++++++--------- .../defs/accuracy/test_llm_api_pytorch.py | 30 +- 
.../test_lists/test-db/l0_b200.yml | 7 + .../test_lists/test-db/l0_dgx_b200.yml | 6 - .../test-db/l0_gb200_multi_gpus.yml | 6 + .../unittest/_torch/modules/test_fused_moe.py | 51 ++-- 9 files changed, 284 insertions(+), 209 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py b/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py index c38cf3391e..61d03b3a97 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py +++ b/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py @@ -65,6 +65,10 @@ class NVLinkTwoSided(Communication): os.environ.get("TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") == "1" ) + # Invalid token expert ID (default to -1), the kernels in TRTLLM-gen is hard-coded to support -1 only. + # CutlassFusedMoE kernels support any invalid value. + self.invalid_token_expert_id: int = -1 + # Initialize NVLINK workspaces MnnvlMemory.initialize() self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(mapping) @@ -168,7 +172,7 @@ class NVLinkTwoSided(Communication): alltoall_info.recv_rank_count_cumsum, all_rank_max_num_tokens, top_k, - self.num_slots, + self.invalid_token_expert_id, self.ep_size, ) diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index c7df8e1f9a..e5bb52ad20 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -402,6 +402,11 @@ class ConfigurableMoE(MoE): 3. Execute MoE computation (single or multiple chunks) 4. Handle output truncation and EPLB repeat """ + # TODO: to clarify whether the output_dtype is needed. 
+ if isinstance(x, Fp4QuantizedTensor): + assert output_dtype is not None + else: + output_dtype = x.dtype # ========== Step 1: Handle padding ========== if all_rank_num_tokens is None: all_rank_num_tokens = [x.shape[0]] @@ -662,7 +667,7 @@ class ConfigurableMoE(MoE): token_final_scales=token_final_scales, x_sf=x_sf, **self._get_backend_kwargs( - router_logits, do_finalize, all_rank_num_tokens, output_dtype + router_logits, do_finalize, all_rank_num_tokens, output_dtype, x ), ) @@ -875,12 +880,68 @@ class ConfigurableMoE(MoE): """Check if using NVLinkTwoSided communication strategy""" return isinstance(self.comm, NVLinkTwoSided) + def _get_nvlink_onesided_moe_output( + self, + all_rank_num_tokens: Optional[List[int]], + output_dtype: Optional[torch.dtype], + ) -> Optional[torch.Tensor]: + """ + Get workspace output buffer for NVLinkOneSided communication backend. + + This method handles moe_output allocation for both CutlassFusedMoE and TRTLLMGenFusedMoE + when using NVLinkOneSided communication strategy. 
+ + Args: + all_rank_num_tokens: Token counts per rank + output_dtype: Output data type + + Returns: + moe_output tensor if NVLinkOneSided is used and backend supports it, None otherwise + """ + if not isinstance(self.comm, NVLinkOneSided): + return None + + # Determine workspace dtype and whether backend supports workspace output + workspace_dtype = output_dtype + backend_supports_workspace = False + + if isinstance(self.backend, TRTLLMGenFusedMoE): + # TRTLLMGen specific configuration + self.comm.invalid_token_expert_id = -1 + workspace_dtype = torch.bfloat16 + backend_supports_workspace = self.backend.has_w4a8_mxfp4_mxfp8 + elif isinstance(self.backend, CutlassFusedMoE): + # Cutlass always supports workspace output with NVLinkOneSided + backend_supports_workspace = True + + if not backend_supports_workspace: + # Ensure payload_in_workspace is False if backend doesn't support it + self.comm.payload_in_workspace = False + return None + + # Calculate runtime max tokens per rank + assert all_rank_num_tokens is not None, ( + "all_rank_num_tokens must be provided for NVLinkOneSided backend" + ) + runtime_max_tokens_per_rank = max(all_rank_num_tokens) + + # Get workspace-backed output tensor + moe_output = self.comm.get_combine_payload_tensor_in_workspace( + runtime_max_tokens_per_rank, self.hidden_size, workspace_dtype + ) + + # Dynamically enable payload_in_workspace for this forward pass + self.comm.payload_in_workspace = True + + return moe_output + def _get_backend_kwargs( self, router_logits: Optional[torch.Tensor] = None, do_finalize: bool = True, all_rank_num_tokens: Optional[List[int]] = None, output_dtype: Optional[torch.dtype] = None, + x: Optional[torch.Tensor] = None, ) -> Dict: """ Get backend-specific keyword arguments for run_moe @@ -905,6 +966,8 @@ class ConfigurableMoE(MoE): router_logits: Router logits tensor (for TRTLLMGen backend) do_finalize: Whether to finalize output (for TRTLLMGen backend) all_rank_num_tokens: Token counts per rank (for 
TRTLLMGen backend moe_output) + output_dtype: Output data type + x: Input tensor (for calculating tuner_num_tokens in Cutlass) Returns: Dict: Backend-specific keyword arguments @@ -917,7 +980,33 @@ class ConfigurableMoE(MoE): # Cutlass-specific parameters if self.backend.__class__ == CutlassFusedMoE: - pass + # Determine if scaling factors are swizzled based on communication flow + # In post-quant communication (quantize -> dispatch), scaling factors are not swizzled + # In pre-quant communication (dispatch -> quantize), scaling factors are swizzled + supports_post_quant = self.comm is not None and self.comm.supports_post_quant_dispatch() + kwargs["is_sf_swizzled"] = not supports_post_quant + kwargs["output_dtype"] = output_dtype + + # Prepare additional information for profiling in case padding is applied when using alltoall. + # Only the non-alltoall case is considered for profiling in the warmup phase. + # Therefore, to get the correct tactics during the actual inference, the inputs to the tuner + # should be the same as when not using alltoall. + if self._is_using_alltoall(): + if all_rank_num_tokens is not None: + kwargs["tuner_num_tokens"] = sum(all_rank_num_tokens) + else: + kwargs["tuner_num_tokens"] = ( + x.shape[0] * self.mapping.tp_size if x is not None else None + ) + kwargs["tuner_top_k"] = self.routing_method.top_k + else: + kwargs["tuner_num_tokens"] = None + kwargs["tuner_top_k"] = None + + # Get moe_output for NVLinkOneSided backend + kwargs["moe_output"] = self._get_nvlink_onesided_moe_output( + all_rank_num_tokens, output_dtype + ) # CuteDSL-specific parameters elif self.backend.__class__ == CuteDslFusedMoE: @@ -940,37 +1029,10 @@ class ConfigurableMoE(MoE): kwargs["router_logits"] = router_logits_arg kwargs["do_finalize"] = do_finalize - # moe_output: workspace output buffer for NVLINK one-sided backend - # TRTLLMGenFusedMoE only supports workspace output for w4a8_mxfp4_mxfp8 quantization. 
- moe_output = None - if isinstance(self.comm, NVLinkOneSided): - # Determine dtype for workspace tensor - # TRTLLMGenFusedMoE always uses bfloat16, other backends use output_dtype - workspace_dtype = output_dtype - if isinstance(self.backend, TRTLLMGenFusedMoE): - self.comm.invalid_token_expert_id = -1 - workspace_dtype = torch.bfloat16 - - # Check if backend supports workspace output for current quantization - backend_supports_workspace = ( - isinstance(self.backend, TRTLLMGenFusedMoE) - and self.backend.has_w4a8_mxfp4_mxfp8 - ) - if backend_supports_workspace: - assert all_rank_num_tokens is not None, ( - "all_rank_num_tokens must be provided for NVLinkOneSided backend with workspace output" - ) - runtime_max_tokens_per_rank = max(all_rank_num_tokens) - - moe_output = self.comm.get_combine_payload_tensor_in_workspace( - runtime_max_tokens_per_rank, self.hidden_size, workspace_dtype - ) - # Dynamically enable payload_in_workspace for this forward pass - self.comm.payload_in_workspace = True - else: - # Ensure payload_in_workspace is False for non-workspace output - self.comm.payload_in_workspace = False - kwargs["moe_output"] = moe_output + # Get moe_output for NVLinkOneSided backend + kwargs["moe_output"] = self._get_nvlink_onesided_moe_output( + all_rank_num_tokens, output_dtype + ) return kwargs diff --git a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py index 368ad0c07b..f921e25014 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py @@ -346,7 +346,7 @@ def create_moe( if ENABLE_CONFIGURABLE_MOE or moe_cls == CuteDslFusedMoE: # ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends - if moe_cls in (TRTLLMGenFusedMoE, CuteDslFusedMoE): + if moe_cls in (TRTLLMGenFusedMoE, CuteDslFusedMoE, CutlassFusedMoE): return ConfigurableMoE( routing_method=routing_method, num_experts=num_experts, diff --git 
a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index c300243dff..534c89d104 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -12,7 +12,7 @@ from ...distributed import allgather from ...expert_statistic import ExpertStatistic from ...model_config import ModelConfig from ...utils import (ActivationType, AuxStreamType, EventType, - Fp4QuantizedTensor, ceil_div) + Fp4QuantizedTensor) from .interface import AlltoallMethodType, MoE from .quantization import UnquantizedFusedMoEMethod @@ -229,7 +229,7 @@ class CutlassFusedMoE(MoE): @property def has_int8_woq_per_channel(self): - return self.quant_config.layer_quant_mode.is_int8_weight_only( + return self.quant_config and self.quant_config.layer_quant_mode.is_int8_weight_only( ) and not self.quant_config.layer_quant_mode.has_per_group_scaling() def select_alltoall_method_type(self) -> AlltoallMethodType: @@ -270,16 +270,22 @@ class CutlassFusedMoE(MoE): def quantize_input( self, x: Union[torch.Tensor, Fp4QuantizedTensor], + post_quant_comm: bool = True, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Quantize input tensor - CutlassFusedMoE implementation Handles all quantization cases for Cutlass backend. - """ - # Determine if this is post-quant communication scenario - run_post_quant_allgather = self.use_dp and self.parallel_size > 1 + Args: + x: Input tensor to quantize + post_quant_comm: Whether this is for post-quantization communication + (allgather or alltoall). If True, x_sf will be reshaped to 2D. 
+ + Returns: + Tuple of (quantized_x, x_sf) + """ x_sf = None if self.has_any_quant: if self.has_fp8_qdq or self.has_w4a8_mxfp4_fp8: @@ -298,25 +304,40 @@ class CutlassFusedMoE(MoE): # No quantization needed here, handled in kernel pass elif self.has_nvfp4: - if run_post_quant_allgather or self.enable_alltoall: + if hasattr( + self, + 'fc31_act_scale') and self.fc31_act_scale is not None: + assert not isinstance( + x, Fp4QuantizedTensor + ), "Fp4QuantizedTensor is not expected for AWQ quantization." + x = x * self.fc31_act_scale + # Quantize based on communication scenario + if post_quant_comm: if isinstance(x, Fp4QuantizedTensor): assert not x.is_sf_swizzled, "Fp4QuantizedTensor should not be swizzled before communication" x, x_sf = x.fp4_tensor, x.scaling_factor + x_row = x.shape[0] else: + x_row = x.shape[0] x, x_sf = torch.ops.trtllm.fp4_quantize( x, self.fc31_input_scale, self.scaling_vector_size, False, False) - # Reshape x_sf to 2D - x_sf = x_sf.view((x.shape[0], -1)) + # Reshape x_sf to 2D for post-quant communication + if x_sf is not None: + x_sf = x_sf.view((x_row, -1)) else: if not isinstance(x, Fp4QuantizedTensor): x, x_sf = torch.ops.trtllm.fp4_quantize( x, self.fc31_input_scale, self.scaling_vector_size, False, True) elif self.has_w4a8_mxfp4_mxfp8: - if run_post_quant_allgather or self.enable_alltoall: + if post_quant_comm: x, x_sf = torch.ops.trtllm.mxfp8_quantize( x, False, alignment=self.quant_method.weight_alignment) + # Reshape x_sf to 2D for post-quant communication + # x.shape[0] is padded + if x_sf is not None: + x_sf = x_sf.view((x.shape[0], -1)) else: x, x_sf = torch.ops.trtllm.mxfp8_quantize( x, True, alignment=self.quant_method.weight_alignment) @@ -368,6 +389,89 @@ class CutlassFusedMoE(MoE): self._weights_created = True self._check_configs() + def run_moe( + self, + x: torch.Tensor, + token_selected_experts: torch.Tensor, + token_final_scales: torch.Tensor, + x_sf: Optional[torch.Tensor] = None, + is_sf_swizzled: bool = True, + 
output_dtype: Optional[torch.dtype] = None, + tuner_num_tokens: Optional[int] = None, + tuner_top_k: Optional[int] = None, + moe_output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Run MoE computation with Cutlass backend. + + This method encapsulates the core MoE computation logic, handling different + quantization schemes. + + Args: + x: Input hidden states (may be pre-quantized) + token_selected_experts: Expert IDs or expert slots [num_tokens, top_k] + If EPLB is enabled, represents expert slots; otherwise expert IDs + token_final_scales: Final scaling factors for each token + x_sf: Input scale factors (optional, for certain quantization schemes) + is_sf_swizzled: Whether scaling factors are swizzled + output_dtype: Output data type (optional) + tuner_num_tokens: Number of tokens for profiling tuner (optional) + tuner_top_k: Top-k value for profiling tuner (optional) + moe_output: Pre-allocated output buffer (optional) + + Returns: + final_hidden_states: Output tensor from MoE computation + """ + # Determine weight dtype based on quantization mode + weight_dtype = self.w3_w1_weight.dtype + if self.has_any_quant: + if self.has_w4afp8: + weight_dtype = torch.quint4x2 + elif self.has_w4a16_mxfp4: + weight_dtype = torch.uint8 + + final_hidden_states = torch.ops.trtllm.fused_moe( + x, + token_selected_experts, + token_final_scales, + self.w3_w1_weight.view(weight_dtype), + self.w3_w1_bias, + self.w2_weight.view(weight_dtype), + self.w2_bias, + output_dtype, + quant_scales=self.quant_scales, + input_sf=x_sf, + swizzled_input_sf=is_sf_swizzled, + swiglu_alpha=self.swiglu_alpha, + swiglu_beta=self.swiglu_beta, + swiglu_limit=self.swiglu_limit, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + cluster_size=self.cluster_size, + cluster_rank=self.cluster_rank, + enable_alltoall=self.enable_alltoall, + use_deepseek_fp8_block_scale=self.has_deepseek_fp8_block_scales, + use_w4_group_scaling=self.has_w4afp8 or 
self.has_w4a16_mxfp4, + use_int8_woq_per_channel=self.has_int8_woq_per_channel, + use_mxfp8_act_scaling=self.has_w4a8_mxfp4_mxfp8, + min_latency_mode=False, + use_fused_finalize=self.use_fused_finalize, + tune_max_num_tokens=self.tune_max_num_tokens, + tuner_num_tokens=tuner_num_tokens, + tuner_top_k=tuner_top_k, + activation_type=self.activation_type, + unpadded_hidden_size=self.unpadded_hidden_size, + out_tensor=moe_output, + ) + # Custom op requires all inputs are in the same type. + # Only in cutlass_min_latency_mode, the output is a list of tensors. + # Otherwise, the output should be unpacked as a single tensor. + final_hidden_states = final_hidden_states[0] + + return final_hidden_states + def forward_chunk( self, x: Union[torch.Tensor, Fp4QuantizedTensor], @@ -421,72 +525,11 @@ class CutlassFusedMoE(MoE): token_final_scales = None run_post_quant_allgather = self.use_dp and self.parallel_size > 1 - # quantize inputs - use_deepseek_fp8_block_scale = False - use_w4_group_scaling = False - use_int8_woq_per_channel = False - use_mxfp8_act_scaling = False - weight_dtype = self.w3_w1_weight.dtype - x_sf = None - x_row = x.shape[0] - x_col = x.shape[1] - if self.has_any_quant: - if self.has_fp8_qdq or self.has_w4a8_mxfp4_fp8: - x, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor( - x, self.fc31_input_dequant) - elif self.has_deepseek_fp8_block_scales: - use_deepseek_fp8_block_scale = True - elif self.has_w4afp8: - use_w4_group_scaling = True - weight_dtype = torch.quint4x2 - elif self.has_w4a16_mxfp4: - pad_size = self.hidden_size - x.shape[1] - x = torch.nn.functional.pad(x, (0, pad_size)) - use_w4_group_scaling = True - weight_dtype = torch.uint8 - elif self.has_int8_woq_per_channel: - use_int8_woq_per_channel = True - elif self.has_nvfp4: - # Apply pre_quant_scale if it exists (for NVFP4_AWQ) - if hasattr( - self, - 'fc31_act_scale') and self.fc31_act_scale is not None: - assert not isinstance( - x, Fp4QuantizedTensor - ), "Fp4QuantizedTensor is not 
expected for AWQ quantization." - x = x * self.fc31_act_scale - if run_post_quant_allgather or self.enable_alltoall: - if isinstance(x, Fp4QuantizedTensor): - assert not x.is_sf_swizzled, "Fp4QuantizedTensor should not be swizzled before communication" - x_row = x.shape[0] - # note: we use uint8 to store 2 fp4 values - x_col = x.shape[1] * 2 - x, x_sf = x.fp4_tensor, x.scaling_factor - else: - x_row = x.shape[0] - x_col = x.shape[1] - x, x_sf = torch.ops.trtllm.fp4_quantize( - x, self.fc31_input_scale, self.scaling_vector_size, - False, False) - else: - if not isinstance(x, Fp4QuantizedTensor): - x, x_sf = torch.ops.trtllm.fp4_quantize( - x, self.fc31_input_scale, self.scaling_vector_size, - False, True) - elif self.has_w4a8_mxfp4_mxfp8: - use_mxfp8_act_scaling = True - if run_post_quant_allgather or self.enable_alltoall: - x, x_sf = torch.ops.trtllm.mxfp8_quantize( - x, False, alignment=self.quant_method.weight_alignment) - else: - x, x_sf = torch.ops.trtllm.mxfp8_quantize( - x, True, alignment=self.quant_method.weight_alignment) - # Update x_row and x_col to the padded shape - x_row, x_col = x.shape[0], x.shape[1] - else: - raise ValueError( - f"unsupported quantization mode: {self.quant_config.quant_mode}" - ) + + # Quantize inputs using extracted method + # For post_quant_comm scenarios, x_sf will be reshaped to 2D inside quantize_input + post_quant_comm = run_post_quant_allgather or self.enable_alltoall + x, x_sf = self.quantize_input(x, post_quant_comm=post_quant_comm) # Prepare additional information for profiling in case padding is applied when using alltoall. # Only the non-alltoall case is considered for profiling in the warmup phase. 
@@ -535,11 +578,6 @@ class CutlassFusedMoE(MoE): self._load_balancer_update_statistic_with_gathered_statistic( gathered_loadbalancer_local_statistic_info) - if x_sf is not None: - x_sf = x_sf.view(x_row, - ceil_div(x_col, self.scaling_vector_size)) - is_sf_swizzled = False - # Dispatch x, x_sf, token_selected_slots, token_final_scales in one alltoall kernel x, x_sf, token_selected_slots, token_final_scales = MnnvlMoe.mnnvl_moe_alltoallv( [x, x_sf, token_selected_slots, token_final_scales], @@ -552,10 +590,6 @@ class CutlassFusedMoE(MoE): self.ep_size) elif self.alltoall_method_type == AlltoallMethodType.NVLinkOneSided: # Python MoeAlltoAll path - if x_sf is not None: - x_sf = x_sf.view(x_row, - ceil_div(x_col, self.scaling_vector_size)) - is_sf_swizzled = False payloads = [] payloads.append(x) @@ -593,20 +627,13 @@ class CutlassFusedMoE(MoE): elif run_post_quant_allgather: # Original allgather logic - if x_sf is not None: - x_sf = x_sf.view(x_row, ceil_div(x_col, - self.scaling_vector_size)) - assert len( - x_sf.shape - ) == 2, "The hidden states scaling factor should be 2D tensor before allgather" - is_sf_swizzled = False + # x_sf is already 2D after quantize_input with post_quant_comm=True x, x_sf, token_selected_slots, token_final_scales = allgather( [x, x_sf, token_selected_slots, token_final_scales], self.mapping, dim=0, sizes=None if use_dp_padding else all_rank_num_tokens) - x_row = x.shape[0] # Optionally provide an output tensor to fused_moe so it writes directly to our buffer moe_output: Optional[torch.Tensor] = None @@ -617,45 +644,19 @@ class CutlassFusedMoE(MoE): moe_output = self.moe_a2a.get_combine_payload_tensor_in_workspace( runtime_max_tokens_per_rank, self.unpadded_hidden_size, output_dtype) - final_hidden_states = torch.ops.trtllm.fused_moe( - x, - token_selected_slots, - token_final_scales, - self.w3_w1_weight.view(weight_dtype), - self.w3_w1_bias, - self.w2_weight.view(weight_dtype), - self.w2_bias, - output_dtype, - 
quant_scales=self.quant_scales, - input_sf=x_sf, - swizzled_input_sf=is_sf_swizzled, - swiglu_alpha=self.swiglu_alpha, - swiglu_beta=self.swiglu_beta, - swiglu_limit=self.swiglu_limit, - tp_size=self.tp_size, - tp_rank=self.tp_rank, - ep_size=self.ep_size, - ep_rank=self.ep_rank, - cluster_size=self.cluster_size, - cluster_rank=self.cluster_rank, - enable_alltoall=self.enable_alltoall, - use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, - use_w4_group_scaling=use_w4_group_scaling, - use_int8_woq_per_channel=use_int8_woq_per_channel, - use_mxfp8_act_scaling=use_mxfp8_act_scaling, - min_latency_mode=False, - use_fused_finalize=self.use_fused_finalize, - tune_max_num_tokens=self.tune_max_num_tokens, + + # Call extracted run_moe method + final_hidden_states = self.run_moe( + x=x, + token_selected_experts=token_selected_slots, + token_final_scales=token_final_scales, + x_sf=x_sf, + is_sf_swizzled=not post_quant_comm, + output_dtype=output_dtype, tuner_num_tokens=tuner_num_tokens, tuner_top_k=tuner_top_k, - activation_type=self.activation_type, - unpadded_hidden_size=self.unpadded_hidden_size, - out_tensor=moe_output, + moe_output=moe_output, ) - # Custom op requires all inputs are in the same type. - # Only in cutlass_min_latency_mode, the output is a list of tensors. - # Otherwise, the output should be unpacked as a single tensor. 
- final_hidden_states = final_hidden_states[0] self._load_balancer_start_set_cpu_stage(is_last_call) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 8cf33c5f12..dbc991eb49 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1879,13 +1879,17 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): torch_compile, mtp_nextn, moe_backend, enable_configurable_moe, mocker): # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + if enable_configurable_moe == 1 and moe_backend not in [ + "TRTLLM", "CUTLASS" + ]: pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " f"current backend is {moe_backend}") # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ + "TRTLLM", "CUTLASS" + ] else "0" patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) @@ -3515,13 +3519,17 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): attention_dp, cuda_graph, overlap_scheduler, activation_dtype, enable_configurable_moe, mocker): # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + if enable_configurable_moe == 1 and moe_backend not in [ + "TRTLLM", "CUTLASS" + ]: pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " f"current backend is {moe_backend}") # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and 
moe_backend == "TRTLLM" else "0" + env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ + "TRTLLM", "CUTLASS" + ] else "0" patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) @@ -3983,13 +3991,17 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ep_size, attention_dp, cuda_graph, overlap_scheduler, enable_configurable_moe, mocker): # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + if enable_configurable_moe == 1 and moe_backend not in [ + "TRTLLM", "CUTLASS" + ]: pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " f"current backend is {moe_backend}") # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ + "TRTLLM", "CUTLASS" + ] else "0" patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index fd04f2028e..3b59f51118 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -82,6 +82,13 @@ l0_b200: - unittest/tools/test_layer_wise_benchmarks.py::test_deepseek_r1_ctx_dep[1] - unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] - unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8 + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-TRTLLM-dtype1] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] + - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-CUTLASS-dtype1] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS] # ------------- AutoDeploy tests --------------- - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index f0c2b3131c..89f6598da3 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -137,8 +137,6 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] @@ -172,16 +170,12 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-tp4-trtllm-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-ep4-trtllm-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index ef59faeb57..447e989f54 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -84,4 +84,10 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-tp4-trtllm-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-ep4-trtllm-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] + - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 1db2aab76a..fa58161896 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -1361,14 +1361,18 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method, if x == 0 else "enable_configurable_moe") def test_fused_moe_nvfp4(dtype, moe_backend, enable_configurable_moe, mocker): - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": - pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + if enable_configurable_moe == 1 and moe_backend not in [ + "TRTLLM", "CUTLASS" + ]: + pytest.skip( + "ENABLE_CONFIGURABLE_MOE=1, only TRTLLM and CUTLASS backend are enabled" + ) mocker.patch.dict( os.environ, { "ENABLE_CONFIGURABLE_MOE": - "1" - if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + "1" if enable_configurable_moe == 1 + and moe_backend in ["TRTLLM", "CUTLASS"] else "0" }) if moe_backend == "TRTLLM" and dtype == torch.float16: @@ -1532,15 +1536,10 @@ def test_fused_moe_nvfp4(dtype, moe_backend, enable_configurable_moe, mocker): ids=lambda x: "" if x == 0 else "enable_configurable_moe") def test_fused_moe_w4a8_nvfp4_fp8(moe_backend, enable_configurable_moe, mocker): - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": - pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") - - mocker.patch.dict( - os.environ, { - "ENABLE_CONFIGURABLE_MOE": - "1" - if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" - }) + mocker.patch.dict(os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" if enable_configurable_moe == 1 else "0" + }) dtype = torch.bfloat16 mapping = Mapping() @@ -1962,15 
+1961,10 @@ def test_fused_moe_w4afp8(dtype, weight_loading_mode): def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias, enable_configurable_moe, mocker): - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": - pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") - - mocker.patch.dict( - os.environ, { - "ENABLE_CONFIGURABLE_MOE": - "1" - if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" - }) + mocker.patch.dict(os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" if enable_configurable_moe == 1 else "0" + }) if moe_backend == "CUTLASS" and hidden_unpadded % 128 != 0: pytest.skip() @@ -2237,15 +2231,10 @@ def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias, def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend, enable_configurable_moe, mocker): - if enable_configurable_moe == 1 and moe_backend != "TRTLLM": - pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") - - mocker.patch.dict( - os.environ, { - "ENABLE_CONFIGURABLE_MOE": - "1" - if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" - }) + mocker.patch.dict(os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" if enable_configurable_moe == 1 else "0" + }) mapping = Mapping() mapping.rank = mpi_rank() From 02edb19f4302226da22fd38bd6446d73ccb5c8da Mon Sep 17 00:00:00 2001 From: JadoTu <107457950+JadoTu@users.noreply.github.com> Date: Fri, 12 Dec 2025 00:52:03 +0800 Subject: [PATCH 074/172] [None] [feat] add eos_token_id in generation_config to sampling params (#9514) Signed-off-by: jiant <107457950+JadoTu@users.noreply.github.com> --- tensorrt_llm/sampling_params.py | 40 ++++++++----------- .../apps/_test_trtllm_serve_top_logprobs.py | 2 +- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py index 57bebba45e..3aa2260cee 100644 --- a/tensorrt_llm/sampling_params.py +++ b/tensorrt_llm/sampling_params.py @@ -373,14 
+373,6 @@ class SamplingParams: if self.end_id is None: self.end_id = tokenizer.eos_token_id self.pad_id = tokenizer.pad_token_id - # kimi_k2 model uses the eos_token_id in generation config - if ( - hf_model_config is not None - and hf_model_config.model_type == "kimi_k2" - and generation_config is not None - and isinstance(generation_config.eos_token_id, int) - ): - self.end_id = generation_config.eos_token_id if self.pad_id is None: self.pad_id = self.end_id @@ -400,24 +392,26 @@ class SamplingParams: strs = [self.stop] if isinstance(self.stop, str) else self.stop self._stop_word_ids = [_encode(tokenizer, s, add_special_tokens) for s in strs] - # add generation_config to stop word list, only in qwen3-next now - if ( - hf_model_config is not None - and hf_model_config.model_type == "qwen3_next" - and generation_config is not None - and isinstance(generation_config.eos_token_id, List) - and all(isinstance(i, int) for i in generation_config.eos_token_id) - ): - if self._stop_word_ids: + # Add eos_token_id in generation_config to _stop_word_ids + # Refer to https://huggingface.co/docs/hub/en/transformers#transformers-repository-files and + # https://github.com/huggingface/transformers/blob/1ae4d917ed3badbdb1ffc167e0529f5a6d3c080d/src/transformers/generation/stopping_criteria.py#L451C1-L451C42 + # The eos_token_id in generation_config are really mean to stop the text generation. 
+ if generation_config is not None and generation_config.eos_token_id is not None: + if isinstance(generation_config.eos_token_id, int): + generation_eos_token_ids = [generation_config.eos_token_id] + else: # always List[int] + generation_eos_token_ids = generation_config.eos_token_id + + if self._stop_word_ids is None: + self._stop_word_ids = [generation_eos_token_ids] + else: all_stop_tokens_id = set(i for sublist in self._stop_word_ids for i in sublist) - from_generation_stop_tokens = [ - i for i in generation_config.eos_token_id if i not in all_stop_tokens_id + from_generation_stop_token_ids = [ + i for i in generation_eos_token_ids if i not in all_stop_tokens_id ] - if from_generation_stop_tokens: - self._stop_word_ids.append(from_generation_stop_tokens) - else: - self._stop_word_ids = [generation_config.eos_token_id] + if from_generation_stop_token_ids: + self._stop_word_ids.append(from_generation_stop_token_ids) return self diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py index d287e5e35e..c7a4fc7f16 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py @@ -110,7 +110,7 @@ async def test_chat_completion_top1_logprobs(async_client: openai.AsyncOpenAI, "content": "You are a helpful assistant." }, { "role": "user", - "content": "What is the capital of France?" + "content": "What is the capital of France? please in detail." 
}] # Test top_logprobs=1 chat_completion = await async_client.chat.completions.create( From 89dabf5aa125384b5eb6079088a9b5698d50967b Mon Sep 17 00:00:00 2001 From: Erin <14718778+hchings@users.noreply.github.com> Date: Thu, 11 Dec 2025 09:33:25 -0800 Subject: [PATCH 075/172] [TRTLLM-9736][feat] AsyncLLM and verl integ (#9353) Signed-off-by: Liwei Ma Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com> Co-authored-by: Liwei Ma Co-authored-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Co-authored-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- tensorrt_llm/__init__.py | 3 +- tensorrt_llm/_torch/async_llm.py | 106 ++++++++++++ tensorrt_llm/_torch/pyexecutor/sampler.py | 45 ++--- tensorrt_llm/_torch/virtual_memory.py | 3 +- tensorrt_llm/executor/ray_executor.py | 158 ++++++++++++++---- tensorrt_llm/executor/ray_gpu_worker.py | 4 +- tensorrt_llm/llmapi/__init__.py | 2 + tensorrt_llm/llmapi/llm.py | 3 +- tensorrt_llm/llmapi/llm_args.py | 81 +++++++++ tensorrt_llm/llmapi/rlhf_utils.py | 16 +- tensorrt_llm/serve/openai_protocol.py | 10 ++ tensorrt_llm/serve/openai_server.py | 41 ++++- .../test_lists/test-db/l0_dgx_h100.yml | 17 ++ .../test_lists/test-db/l0_h100.yml | 1 + .../multi_gpu/test_executor.py | 68 ++++++-- .../api_stability/references/llm.yaml | 4 + tests/unittest/llmapi/test_async_llm.py | 137 +++++++++++++++ 17 files changed, 629 insertions(+), 70 deletions(-) create mode 100644 tensorrt_llm/_torch/async_llm.py create mode 100644 tests/unittest/llmapi/test_async_llm.py diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index 978cf0796f..cea56431b7 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -84,7 +84,7 @@ from ._utils import (default_gpus_per_node, local_mpi_rank, local_mpi_size, from .builder import BuildConfig, Builder, 
BuilderConfig, build from .disaggregated_params import DisaggregatedParams from .functional import Tensor, constant -from .llmapi import LLM, MultimodalEncoder +from .llmapi import LLM, AsyncLLM, MultimodalEncoder from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs from .logger import logger from .mapping import Mapping @@ -136,6 +136,7 @@ __all__ = [ 'quantization', 'tools', 'LLM', + 'AsyncLLM', 'MultimodalEncoder', 'LlmArgs', 'TorchLlmArgs', diff --git a/tensorrt_llm/_torch/async_llm.py b/tensorrt_llm/_torch/async_llm.py new file mode 100644 index 0000000000..76c33220da --- /dev/null +++ b/tensorrt_llm/_torch/async_llm.py @@ -0,0 +1,106 @@ +from typing import Any, List, Optional + +from ..llmapi.llm import LLM +from ..llmapi.llm_args import RayPlacementConfig + + +class AsyncLLM(LLM): + """AsyncLLM is a subclass of LLM that supports asynchronous setup, release and + resume operations that are necessary for RL or agentic scenarios. + + Currently, RL APIs are only supported with Ray orchestrator. + """ + + def __init__( + self, + placement_groups: Optional[List[Any]] = None, + placement_bundle_indices: Optional[List[List[int]]] = None, + per_worker_gpu_share: Optional[float] = None, + *args, + **kwargs, + ): + kwargs["orchestrator_type"] = "ray" + kwargs["ray_placement_config"] = RayPlacementConfig( + defer_workers_init=True, + placement_groups=placement_groups, + placement_bundle_indices=placement_bundle_indices, + per_worker_gpu_share=per_worker_gpu_share, + ) + + # WAR: RL integration needs to use NCCL AllReduce for TP>1 due to a bug in TRTLLM's AllReduce + # which will cause convergence issue when using multiple rollout instances. 
+ kwargs["allreduce_strategy"] = "NCCL" + + if "ray_worker_extension_cls" not in kwargs: + kwargs["ray_worker_extension_cls"] = "tensorrt_llm.llmapi.rlhf_utils.WorkerExtension" + + super().__init__(*args, **kwargs) + self._async_initialized = False + + async def setup_async(self): + """Setup the LLM asynchronously.""" + if not self._async_initialized: + await self._executor.init_workers_async() + await self._executor.setup_engine_remote_async() + self._async_initialized = True + return self + + async def release(self, tags: list[str]): + """Release the GPU memory used by the LLM asynchronously. + + Args: + tags: List of memory tag strings to release (e.g., ["model", "kv_cache"]). + """ + await self.collective_rpc("sleep", args=(tags,)) + + async def resume(self, tags: list[str]): + """Resume the GPU memory used by the LLM asynchronously. + + Args: + tags: List of memory tag strings to resume (e.g., ["model", "kv_cache"]). + """ + await self.collective_rpc("wakeup", args=(tags,)) + + async def update_weights(self, weights: dict[str, str]): + """Update the weights of the LLM asynchronously. + + + Args: + weights: Dictionary mapping device UUIDs to IPC handles for weight tensors. + """ + await self.collective_rpc("update_weights", args=(weights,)) + + async def collective_rpc( + self, + method: str, + args: tuple[Any, ...] = (), + kwargs: Optional[dict] = None, + unique_reply_rank: Optional[int] = None, + ) -> list[Any]: + """Execute an asynchronous RPC call on all GPU workers. Currently, this is only supported for RayExecutor. + + Args: + method (str): The name of the worker method to execute. + args (tuple[Any, ...]): Positional arguments to pass to the worker method. Defaults to (). + kwargs (dict, optional): Keyword arguments to pass to the worker method. Defaults to None. + unique_reply_rank (int, optional): The rank of the worker that will be used to send the reply. + + Returns: + list[Any]: A list of results from each worker. 
+ """ + return await self._executor.collective_rpc_async( + method, args, kwargs, unique_reply_rank=unique_reply_rank + ) + + def __await__(self): + return self.setup_async().__await__() + + def __enter__(self): + raise RuntimeError("Please use 'async with AsyncLLM' instead") + + async def __aenter__(self): + await self.setup_async() + return super().__enter__() + + async def __aexit__(self, exc_type, exc_val, exc_tb): + return super().__exit__(exc_type, exc_val, exc_tb) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 11519f6aa5..40d1450e45 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -3010,7 +3010,7 @@ class TRTLLMSampler(Sampler, AsyncWorkerMixin): new_tokens_host = state.host.new_tokens.flatten().tolist() sequence_lengths_host_data = state.host.sequence_lengths.flatten().tolist() finish_reasons = state.host.finish_reasons.flatten().tolist() - log_probs_host = state.host.log_probs.tolist() if state.host.log_probs is not None else None + log_probs_host_tensor = state.host.log_probs cum_log_probs_host = ( state.host.cum_log_probs.tolist() if state.host.cum_log_probs is not None else None ) @@ -3032,24 +3032,31 @@ class TRTLLMSampler(Sampler, AsyncWorkerMixin): add_new_tokens_to_requests(reqs_with_new_tokens, new_tokens, 0) # Log probs - for request in reqs_with_new_tokens: - if request.py_return_log_probs: - seq_slot = request.py_seq_slot - seq_len = sequence_lengths_host_data[seq_slot] - begin_log_probs_offset = request.prompt_len - current_token = seq_len - request.prompt_len - 1 - log_probs = [ - { - new_tokens_host[seq_slot]: Logprob( - logprob=log_probs_host[seq_slot][0][ - begin_log_probs_offset + current_token - ], - rank=1, - ) - } - ] - cum_log_probs = [cum_log_probs_host[seq_slot]] - request.py_result.append_log_probs([log_probs], cum_log_probs) + if log_probs_host_tensor is not None: + # Log probs + seq_slots = [] + seq_lens = [] + for 
request in reqs_with_new_tokens: + if request.py_return_log_probs: + seq_slot = request.py_seq_slot + seq_slots.append(seq_slot) + seq_lens.append(sequence_lengths_host_data[seq_slot] - 1) + + log_probs_host = log_probs_host_tensor[seq_slots, 0, seq_lens].tolist() + idx = 0 + for request in reqs_with_new_tokens: + if request.py_return_log_probs: + log_probs = [ + { + new_tokens_host[seq_slot]: Logprob( + logprob=log_probs_host[idx], + rank=1, + ) + } + ] + cum_log_probs = [cum_log_probs_host[seq_slot]] + request.py_result.append_log_probs([log_probs], cum_log_probs) + idx += 1 for request in reqs: request.py_decoding_iter += 1 diff --git a/tensorrt_llm/_torch/virtual_memory.py b/tensorrt_llm/_torch/virtual_memory.py index 3702d73253..7efdd60c35 100644 --- a/tensorrt_llm/_torch/virtual_memory.py +++ b/tensorrt_llm/_torch/virtual_memory.py @@ -74,7 +74,8 @@ class ExecutorMemoryType(StrEnum): SPEC_RESOURCES = "spec_resource_manager" INIT_KV_CACHE = "_no_capture_init_kv_cache" INIT_EXTRA_RESOURCES = "_no_capture_init_extra_resources" - MODEL_EXTRA = "_no_capture_model_extra" # TODO: remove _no_capture after torch fix crash on torch.cuda.empty_cache() + # MODEL_EXTRA = "_no_capture_model_extra" # TODO: remove _no_capture after torch fix crash on torch.cuda.empty_cache() + MODEL_EXTRA = "model_extra" EXTRA_RESOURCES = "executor_extra" KV_CACHE = "kv_cache" MODEL_ENGINE_MAIN = "model" diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index 579aac0a71..e03f524bea 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -1,3 +1,4 @@ +import asyncio import os from typing import Any, Dict, List, Optional, Tuple @@ -7,8 +8,7 @@ except ModuleNotFoundError as e: e.msg = """Cannot import Ray. 
Please install 'ray' package to use ray orchestrator""" raise -from ray.util.placement_group import (PlacementGroup, - PlacementGroupSchedulingStrategy, +from ray.util.placement_group import (PlacementGroupSchedulingStrategy, get_current_placement_group, placement_group) @@ -23,6 +23,7 @@ from .ray_gpu_worker import RayGPUWorker, RayWorkerWrapper from .request import GenerationRequest from .result import GenerationResult from .rpc_proxy_mixin import RpcExecutorMixin +from .utils import has_event_loop __all__ = [ "RayExecutor", @@ -77,19 +78,30 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): self.master_address = ray.util.get_node_ip_address() self.master_port = get_free_port() - worker_kwargs = dict(**worker_kwargs, - postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor) + self.worker_kwargs = dict( + **worker_kwargs, + postproc_worker_config=postproc_worker_config, + is_llm_executor=is_llm_executor) self.init_rpc_executor() # Inject the generated HMAC key into worker_kwargs for workers - worker_kwargs['hmac_key'] = self.hmac_key - worker_kwargs['rpc_addr'] = self.rpc_addr - self.create_workers(RayGPUWorker, worker_kwargs) - self.setup_engine_remote() - self.setup_mainloop(tasks=[self._fetch_responses_loop_async], - thread_name="ray_executor_main_loop") - logger.info(f"Connecting to RPC server at {self.rpc_addr}") + self.worker_kwargs['hmac_key'] = self.hmac_key + self.worker_kwargs['rpc_addr'] = self.rpc_addr + + placement_config = getattr(self.worker_kwargs['llm_args'], + 'ray_placement_config', None) + defer_workers_init = placement_config.defer_workers_init if placement_config else False + + if defer_workers_init: + self.workers = [ + ] # Placeholder, will be initialized in setup_async + self._mainloop_started = False # DO NOT start mainloop until after setup_engine_remote_async is called + else: + if not has_event_loop(): + self.init_workers_sync() + self.setup_engine_remote() + 
self.setup_mainloop(tasks=[self._fetch_responses_loop_async], + thread_name="ray_executor_main_loop") except Exception as e: self.shutdown() @@ -97,9 +109,16 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): raise e def create_workers(self, worker_cls, worker_kwargs): + llm_args = worker_kwargs.get("llm_args") + placement_config = getattr(llm_args, 'ray_placement_config', + None) if llm_args else None + # When set to be a fraction, it allows Ray to schedule # multiple actors on a single GPU for colocate use cases. num_gpus = float(os.getenv("TRTLLM_RAY_PER_WORKER_GPUS", "1.0")) + if placement_config and placement_config.per_worker_gpu_share is not None: + num_gpus = placement_config.per_worker_gpu_share + logger.debug(f"{num_gpus=} for each worker.") runtime_env = ray.runtime_env.RuntimeEnv() @@ -110,28 +129,40 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): "MASTER_PORT": str(self.master_port) }) - self.placement_group, self.bundle_indices = self._get_placement_group( - tp_size=self.tp_size) + placement_groups, self.bundle_indices = self._get_placement_group( + tp_size=self.tp_size, worker_kwargs=worker_kwargs) - self.workers = [ - RayWorkerWrapper.options( + if isinstance(placement_groups, list): + self.placement_group = None + else: + self.placement_group = placement_groups + + self.workers = [] + for rank in range(self.world_size): + pg = placement_groups[rank] if isinstance( + placement_groups, list) else placement_groups + worker = RayWorkerWrapper.options( num_gpus=num_gpus, - runtime_env=runtime_env, # per-actor env + runtime_env=runtime_env, scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=self.placement_group, + placement_group=pg, placement_group_bundle_index=self.bundle_indices[rank], )).remote(worker_cls, worker_kwargs, self.world_size, rank) - for rank in range(self.world_size) - ] + self.workers.append(worker) + def init_workers_sync(self): + self.create_workers(RayGPUWorker, self.worker_kwargs) try: - 
ray.get([worker.__ray_ready__.remote() for worker in self.workers]) + ray.get(self._get_worker_ready_futures()) except ray.exceptions.ActorDiedError as e: - if "The actor died because of an error raised in its creation task" in str( - e): - raise RuntimeError( - "RayGPUWorker died during initialization") from e - raise + raise RuntimeError("RayGPUWorker died during initialization") from e + + async def init_workers_async(self): + self.create_workers(RayGPUWorker, self.worker_kwargs) + try: + await asyncio.gather(*self._get_worker_ready_futures()) + except ray.exceptions.ActorDiedError as e: + raise RuntimeError("RayGPUWorker died during initialization") from e @unwrap_ray_errors() def call_all_ray_workers(self, func: str, leader_only: bool, @@ -171,6 +202,20 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): **kwargs)) return refs if non_block else ray.get(refs) + @unwrap_ray_errors() + async def collective_rpc_async( + self, + method: str, + args: tuple = (), + kwargs: Optional[dict] = None, + unique_reply_rank: Optional[int] = None) -> list[Any]: + refs = self.collective_rpc(method, + args, + kwargs, + non_block=True, + unique_reply_rank=unique_reply_rank) + return await asyncio.gather(*refs) + def submit(self, request: "GenerationRequest") -> "GenerationResult": """ Low-level API to the executor. 
Return a "future" GenerationResult @@ -198,6 +243,26 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): def setup_engine_remote(self): return self.collective_rpc("setup_engine", non_block=False) + async def setup_engine_remote_async(self): + """Async version of setup_engine_remote for use after async worker initialization.""" + if not self.workers or len(self.workers) == 0: + raise RuntimeError( + "Workers must be initialized before calling setup_engine_remote_async" + ) + + # Setup engine on all workers + result = await self.collective_rpc_async("setup_engine") + logger.info("setup_engine_remote_async finished") + + # Now that engine is set up, start the mainloop for fetching responses + if hasattr(self, '_mainloop_started') and not self._mainloop_started: + logger.info("Starting mainloop after engine setup") + self.setup_mainloop(tasks=[self._fetch_responses_loop_async], + thread_name="ray_executor_main_loop") + self._mainloop_started = True + + return result + def report_device_ids(self) -> list[str]: gpu_ids = self.call_all_ray_workers("report_device_id", leader_only=False, @@ -265,15 +330,52 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): logger.debug("Shutting down Ray cluster") ray.shutdown() - def _get_placement_group(self, - tp_size: int) -> Tuple[PlacementGroup, List[int]]: + def _get_worker_ready_futures(self): + return [worker.__ray_ready__.remote() for worker in self.workers] + + def _get_placement_group( + self, + tp_size: int, + worker_kwargs: Dict = None) -> Tuple[Any, List[int]]: """ Either use the existing placement group from driver script (e.g., in the case of RL FW integration), or create a default PACK placement group where each bundle has tp_size GPUs. - When tp_size ≤ GPUs per node, keep one TP group per node. - When tp_size > GPUs per node, allow a TP group span nodes. 
- rank 0 must be put on the driver node + + Returns: + Tuple of (placement_group(s), bundle_indices) + - placement_group(s) can be a single PlacementGroup or a List[PlacementGroup] + - bundle_indices is always a List[int] """ + llm_args = worker_kwargs.get("llm_args") if worker_kwargs else None + + placement_config = getattr(llm_args, 'ray_placement_config', + None) if llm_args else None + if placement_config and placement_config.placement_groups is not None: + total_workers = sum( + len(indices) + for indices in placement_config.placement_bundle_indices) + if total_workers != self.world_size: + raise ValueError( + f"Total bundle indices ({total_workers}) must equal world_size ({self.world_size})" + ) + + logger.info( + f"Creating {self.world_size} workers with external placement groups" + ) + + flat_pgs = [] + flat_indices = [] + for pg, indices in zip(placement_config.placement_groups, + placement_config.placement_bundle_indices): + for idx in indices: + flat_pgs.append(pg) + flat_indices.append(idx) + + return flat_pgs, flat_indices + bundle_indices = os.getenv("TRTLLM_RAY_BUNDLE_INDICES", None) if bundle_indices: diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index 48f036abeb..fca5386cb5 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -1,3 +1,4 @@ +import gc import importlib import os from pathlib import Path @@ -43,7 +44,6 @@ class RayWorkerWrapper: def __init__(self, worker_cls, worker_kwargs, world_size, rank): self.master_address = os.environ["MASTER_ADDR"] self.master_port = os.environ["MASTER_PORT"] - # Ray can't pickle TensorRT logger global logger from tensorrt_llm.logger import logger @@ -218,6 +218,8 @@ class RayGPUWorker(RpcWorkerMixin, BaseWorker): torch.cuda.synchronize() release_with_tag(*tags) torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() except Exception as e: logger.error(f"Encountered an error in sleep: {e}") raise e diff 
--git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py index cb868d8d06..8563b9090c 100644 --- a/tensorrt_llm/llmapi/__init__.py +++ b/tensorrt_llm/llmapi/__init__.py @@ -1,3 +1,4 @@ +from .._torch.async_llm import AsyncLLM from ..disaggregated_params import DisaggregatedParams from ..executor import CompletionOutput, LoRARequest, RequestError from ..sampling_params import GuidedDecodingParams, SamplingParams @@ -23,6 +24,7 @@ from .mpi_session import MpiCommSession __all__ = [ 'LLM', + 'AsyncLLM', 'MultimodalEncoder', 'CompletionOutput', 'RequestOutput', diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 41c9bdeeae..33774f0ed8 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -193,7 +193,7 @@ class BaseLLM: self.mpi_session = self.args.mpi_session if self.args.parallel_config.is_multi_gpu: - if get_device_count( + if os.getenv("RAY_LOCAL_WORLD_SIZE") is None and get_device_count( ) < self.args.parallel_config.world_size_per_node: raise RuntimeError( f"Only {get_device_count()} GPUs are available, but {self.args.parallel_config.world_size} are required." 
@@ -229,7 +229,6 @@ class BaseLLM: self.runtime_context: Optional[_ModelRuntimeContext] = None self.llm_build_stats = LlmBuildStats() - self._build_model() except Exception: diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 8627c883b9..9b32dbe91e 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -19,6 +19,11 @@ from pydantic import PrivateAttr, field_validator, model_validator from strenum import StrEnum from transformers import PreTrainedTokenizerBase +try: + from ray.util.placement_group import PlacementGroup +except ImportError: + PlacementGroup = None + from tensorrt_llm.lora_helper import (LoraConfig, get_default_trtllm_modules_to_hf_modules) @@ -1086,6 +1091,65 @@ class AutoDecodingConfig(DecodingBaseConfig): return backend == "pytorch" +class RayPlacementConfig(StrictBaseModel): + """ + Configuration for Ray GPU workers placement. + This config is only used with AsyncLLM for RL scenarios. + """ + defer_workers_init: bool = Field( + default=False, + description="Defer Ray worker initialization until async setup.") + + placement_groups: Optional[List[Any]] = Field( + default=None, + description="List of Ray placement groups, one per node. " + "Each element must be a ray.util.placement_group.PlacementGroup instance." + ) + + placement_bundle_indices: Optional[List[List[int]]] = Field( + default=None, + description="List of bundle indices for each placement group. " + "Outer list corresponds to placement_groups, inner list contains bundle indices for that group." + ) + + per_worker_gpu_share: Optional[float] = Field( + default=None, + description="GPU fraction per worker for colocation scenarios. " + "Example: 0.1 means 10 actors can share one GPU. Defaults to 1.0 (one actor per GPU)." 
+ ) + + @model_validator(mode='after') + def validate_ray_placement(self) -> 'RayPlacementConfig': + has_pgs = self.placement_groups is not None + has_indices = self.placement_bundle_indices is not None + + if has_pgs != has_indices: + raise ValueError( + "placement_groups and placement_bundle_indices must be provided together" + ) + + if has_pgs: + if len(self.placement_groups) != len(self.placement_bundle_indices): + raise ValueError( + f"placement_groups length ({len(self.placement_groups)}) must equal " + f"placement_bundle_indices length ({len(self.placement_bundle_indices)})" + ) + if PlacementGroup is not None: + for i, pg in enumerate(self.placement_groups): + if not isinstance(pg, PlacementGroup): + raise TypeError( + f"placement_groups[{i}] must be a Ray PlacementGroup, " + f"got {type(pg).__name__}") + + if self.per_worker_gpu_share is not None: + if not (0 < self.per_worker_gpu_share <= 1.0): + raise ValueError( + f"per_worker_gpu_share must be between 0 and 1.0, " + f"got {self.per_worker_gpu_share}") + + return self + + class PybindMirror(ABC): ''' A class containing the utilities for mirroring Python classes to pybinding classes. @@ -2032,6 +2096,8 @@ class BaseLlmArgs(StrictBaseModel): @field_validator("gpus_per_node", mode='before') @classmethod def validate_gpus_per_node(cls, v, info): + if os.getenv("RAY_LOCAL_WORLD_SIZE") is not None: + return info.data.get("tensor_parallel_size") if v is None: logger.warning( f"Using default gpus_per_node: {torch.cuda.device_count()}") @@ -2750,6 +2816,13 @@ class TorchLlmArgs(BaseLlmArgs): "Allows users to extend the functions of the RayGPUWorker class.", status="prototype") + ray_placement_config: Optional[RayPlacementConfig] = Field( + default=None, + description= + "Placement config for RayGPUWorker. 
Only used with AsyncLLM and orchestrator_type='ray'.", + exclude=True, + status="prototype") + enable_sleep: bool = Field( default=False, description= @@ -3059,6 +3132,14 @@ class TorchLlmArgs(BaseLlmArgs): ) return self + @model_validator(mode='after') + def validate_ray_placement_config(self) -> 'TorchLlmArgs': + if self.ray_placement_config is not None and self.orchestrator_type != "ray": + raise ValueError( + "ray_placement_config is only supported with orchestrator_type='ray'" + ) + return self + def get_executor_config( self, _hf_model_dir: Optional[Path] = None, diff --git a/tensorrt_llm/llmapi/rlhf_utils.py b/tensorrt_llm/llmapi/rlhf_utils.py index 4934d40e97..ce6eaa5b4f 100644 --- a/tensorrt_llm/llmapi/rlhf_utils.py +++ b/tensorrt_llm/llmapi/rlhf_utils.py @@ -1,3 +1,5 @@ +import base64 +import pickle # nosec B403 from typing import Optional import torch @@ -56,12 +58,20 @@ class WorkerExtension: raise ValueError(f"Device UUID {device_uuid} not found in ipc_handles") weights = {} - all_handles = ipc_handles[device_uuid] + + serialized_handles = ipc_handles[device_uuid] + if isinstance(serialized_handles, str): + # Data is base64-encoded pickled bytes - deserialize it + logger.info("Deserializing base64-encoded weight handles") + all_handles = pickle.loads(base64.b64decode(serialized_handles)) # nosec B301 + else: + # Data is already in the correct format (backward compatibility) + all_handles = serialized_handles for param_name, tensor_handle in all_handles: func, args = tensor_handle list_args = list(args) - list_args[6] = self.device_id # Set target device + list_args[6] = self.device_id tensor = func(*list_args) weights[param_name] = tensor @@ -88,7 +98,7 @@ class WorkerExtension: logger.error("Encountered an error in update_weights") raise e - def check_weights_updated(self): + def check_weights_updated(self) -> bool: """Check if the weights are updated to 0.""" weights_updated = True for name, p in self.engine.model_engine.model.named_parameters(): 
diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py index 9dc837810e..8ddda27cd7 100644 --- a/tensorrt_llm/serve/openai_protocol.py +++ b/tensorrt_llm/serve/openai_protocol.py @@ -968,6 +968,16 @@ class ResponsesStreamResponse(OpenAIBaseModel): "response.incomplete"] +class MemoryUpdateRequest(OpenAIBaseModel): + tags: List[str] = Field(default=["model", "kv_cache"]) + + +class UpdateWeightsRequest(OpenAIBaseModel): + weights: Optional[Dict[str, str]] = Field( + default=None, + description="Weight handles dict, or None to finalize update") + + def encode_opaque_state(opaque_state: Optional[bytes]) -> Optional[str]: if opaque_state is None: return None diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index e64c5d20df..3811c8a12e 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -21,6 +21,7 @@ from starlette.routing import Mount from transformers import AutoProcessor from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm._torch.async_llm import AsyncLLM # yapf: disable from tensorrt_llm.executor import CppExecutorError from tensorrt_llm.executor.postproc_worker import PostprocParams @@ -46,9 +47,11 @@ from tensorrt_llm.serve.openai_protocol import (ChatCompletionRequest, ChatMessage, CompletionRequest, CompletionResponse, CompletionResponseChoice, - ErrorResponse, ModelCard, + ErrorResponse, + MemoryUpdateRequest, ModelCard, ModelList, PromptTokensDetails, - ResponsesRequest, UsageInfo, + ResponsesRequest, + UpdateWeightsRequest, UsageInfo, to_llm_disaggregated_params) from tensorrt_llm.serve.postprocess_handlers import ( ChatCompletionPostprocArgs, ChatPostprocArgs, CompletionPostprocArgs, @@ -262,6 +265,16 @@ class OpenAIServer: self.app.add_api_route("/v1/responses", self.openai_responses, methods=["POST"]) + # RL-only endpoints + self.app.add_api_route("/release_memory", + self.release_memory, + methods=["POST"]) + 
self.app.add_api_route("/resume_memory", + self.resume_memory, + methods=["POST"]) + self.app.add_api_route("/update_weights", + self.update_weights, + methods=["POST"]) if self.llm.args.return_perf_metrics: # register /prometheus/metrics self.mount_metrics() @@ -298,6 +311,16 @@ class OpenAIServer: self.app.add_api_route("/v1/chat/completions", self.openai_mm_encoder, methods=["POST"]) + # RL-only endpoints + self.app.add_api_route("/release_memory", + self.release_memory, + methods=["POST"]) + self.app.add_api_route("/resume_memory", + self.resume_memory, + methods=["POST"]) + self.app.add_api_route("/update_weights", + self.update_weights, + methods=["POST"]) async def health(self) -> Response: if self._check_health(): @@ -990,6 +1013,20 @@ class OpenAIServer: return JSONResponse(content={"detail": "None"}) + async def release_memory(self, request: MemoryUpdateRequest) -> JSONResponse: + assert isinstance(self.llm, AsyncLLM), "/release_memory endpoint is only supported with AsyncLLM()" + await self.llm.collective_rpc('sleep', args=(request.tags,)) + return JSONResponse(content={"status": "success"}) + + async def resume_memory(self, request: MemoryUpdateRequest) -> JSONResponse: + assert isinstance(self.llm, AsyncLLM), "/resume_memory endpoint is only supported with AsyncLLM()" + await self.llm.collective_rpc('wakeup', args=(request.tags,)) + return JSONResponse(content={"status": "success"}) + + async def update_weights(self, request: UpdateWeightsRequest) -> JSONResponse: + assert isinstance(self.llm, AsyncLLM), "/update_weights endpoint is only supported with AsyncLLM()" + await self.llm.collective_rpc('update_weights', args=(request.weights,)) + return JSONResponse(content={"status": "success"}) async def __call__(self, host, port, sockets: list[socket.socket] | None = None): # Store the binding address for server registration diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 
530b9cf5f1..a239bf32d6 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -279,8 +279,25 @@ l0_dgx_h100: tests: - unittest/_torch/ray_orchestrator/multi_gpu -m "gpu2" - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2" + - unittest/llmapi/test_async_llm.py -m "gpu2" - accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray - examples/test_ray.py::test_llm_inference_distributed_ray[tp2] - examples/test_ray.py::test_llm_inference_distributed_ray[pp2] - examples/test_ray.py::test_llm_inference_distributed_ray[tep2] - examples/test_ray.py::test_ray_disaggregated_serving[tp1] +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*h100*' + linux_distribution_name: ubuntu* + terms: + stage: pre_merge + backend: pytorch + orchestrator: ray + tests: + - unittest/_torch/ray_orchestrator/multi_gpu -m "gpu4" + - unittest/llmapi/test_async_llm.py -m "gpu4" diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index f3dc84e81e..d7906b794f 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -144,6 +144,7 @@ l0_h100: - unittest/_torch/executor - unittest/_torch/ray_orchestrator/single_gpu - unittest/llmapi/test_llm_pytorch.py + - unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)" - examples/test_ray.py::test_llm_inference_async_ray - condition: ranges: diff --git a/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_executor.py b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_executor.py index bea4f94d71..578be1f6dd 100644 --- a/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_executor.py +++ b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_executor.py @@ -9,27 +9,23 @@ from utils.llm_data import llm_models_root from tensorrt_llm import LLM from tensorrt_llm._torch.utils import get_device_uuid 
from tensorrt_llm.llmapi import KvCacheConfig +from tensorrt_llm.llmapi.llm_args import RayPlacementConfig -class DummyWorkerExtension: - - def additional_method(self): - return "SUCCESS" - - +@pytest.mark.gpu2 def test_worker_extension(): llm = LLM(model=llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0", - ray_worker_extension_cls="test_executor.DummyWorkerExtension", - orchestrator_type="ray") - result = llm._collective_rpc("additional_method") - assert result[0] == "SUCCESS" + ray_worker_extension_cls= + "tensorrt_llm.llmapi.rlhf_utils.WorkerExtension", + orchestrator_type="ray", + tensor_parallel_size=2) + result = llm._collective_rpc("check_weights_updated") + assert isinstance(result[0], bool) @pytest.mark.gpu4 -def test_bundle_indices(monkeypatch): - """Placement via bundle indices""" - +def test_placement_env_vars(monkeypatch): monkeypatch.setenv("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1") pg = None @@ -77,6 +73,52 @@ def test_bundle_indices(monkeypatch): ray.shutdown() +@pytest.mark.gpu2 +@pytest.mark.threadleak(enabled=False) +@pytest.mark.parametrize("n_gpus,bundle_indices", [ + (2, [1]), +], + ids=["gpu2_tp1"]) +def test_placement_api(monkeypatch, n_gpus, bundle_indices): + monkeypatch.setenv("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1") + + tp_size = n_gpus // 2 + pg = None + try: + ray.init() + pg = placement_group([{"GPU": 1, "CPU": 1}] * n_gpus) + ray.get(pg.ready()) + print(f"Placement group ready with bundles {pg.bundle_specs}") + + llm = LLM( + model=os.path.join(llm_models_root(), "llama-models-v2", + "TinyLlama-1.1B-Chat-v1.0"), + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.1), + tensor_parallel_size=tp_size, + orchestrator_type="ray", + ray_placement_config=RayPlacementConfig( + placement_groups=[pg], + placement_bundle_indices=[bundle_indices], + per_worker_gpu_share=0.8, + ), + ) + + inference_actor_uuids = llm._collective_rpc("report_device_id") + expected_uuids = [get_device_uuid(idx) for idx in 
bundle_indices] + + print( + f"{inference_actor_uuids=}, all_uuids={[get_device_uuid(i) for i in range(n_gpus)]}" + ) + + assert sorted(inference_actor_uuids) == sorted(expected_uuids), \ + f"Workers not placed on expected GPUs. Expected: {expected_uuids}, Got: {inference_actor_uuids}" + + finally: + if pg is not None: + remove_placement_group(pg) + ray.shutdown() + + @pytest.mark.gpu2 def test_cuda_visible_device(monkeypatch): """Placement via cuda_visible_device""" diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index e316d45a81..6f2066ee59 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -211,6 +211,10 @@ methods: annotation: Optional[str] default: null status: prototype + ray_placement_config: + annotation: Optional[tensorrt_llm.llmapi.llm_args.RayPlacementConfig] + default: null + status: prototype enable_sleep: annotation: bool default: False diff --git a/tests/unittest/llmapi/test_async_llm.py b/tests/unittest/llmapi/test_async_llm.py new file mode 100644 index 0000000000..e0e7dd6d0f --- /dev/null +++ b/tests/unittest/llmapi/test_async_llm.py @@ -0,0 +1,137 @@ +import os + +import pytest +import ray +from ray.util.placement_group import placement_group, remove_placement_group +from utils.llm_data import llm_models_root +from utils.util import get_current_process_gpu_memory + +from tensorrt_llm import AsyncLLM +from tensorrt_llm._torch.utils import get_device_uuid +from tensorrt_llm._torch.virtual_memory import ExecutorMemoryType +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams + + +@pytest.mark.ray +@pytest.mark.asyncio +async def test_async_llm_awaitable(): + llama_model_path = str(llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0") + kv_cache_config = KvCacheConfig(enable_block_reuse=False) + + prompt = "The future of AI is" + sampling_params = SamplingParams(temperature=0, max_tokens=12) + + llm 
= await AsyncLLM( + model=llama_model_path, + enable_sleep=True, + cuda_graph_config=None, + kv_cache_config=kv_cache_config, + ) + + output = await llm.generate_async(prompt, sampling_params) + assert output.outputs[0].text + print("Output text:", output.outputs[0].text) + + del llm + + +@pytest.mark.ray +@pytest.mark.gpu2 +@pytest.mark.asyncio +@pytest.mark.parametrize("num_cycles", [3], ids=lambda x: f"{x}_cycle") +async def test_async_llm_release_resume(process_gpu_memory_info_available, num_cycles): + llama_model_path = str(llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0") + kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=4096) + + prompt = "The future of AI is" + sampling_params = SamplingParams(temperature=0, max_tokens=12) + tags = [tag.value for tag in ExecutorMemoryType] + + async with AsyncLLM( + model=llama_model_path, + enable_sleep=True, + cuda_graph_config=None, + kv_cache_config=kv_cache_config, + tensor_parallel_size=2, + ) as llm: + # Generate baseline + output_before = await llm.generate_async(prompt, sampling_params) + baseline_text = output_before.outputs[0].text + + for cycle in range(num_cycles): + memory_usage_active = get_current_process_gpu_memory(True) / 1024**3 + print(f"[Cycle {cycle + 1}] Memory usage before release: {memory_usage_active:.2f} GB") + + await llm.release(tags) + memory_usage_released = get_current_process_gpu_memory(True) / 1024**3 + + if process_gpu_memory_info_available: + print( + f"[Cycle {cycle + 1}] Memory usage after release: {memory_usage_released:.2f} GB" + ) + assert memory_usage_released < memory_usage_active, ( + f"Released memory ({memory_usage_released:.2f} GB) should be < " + f"active memory ({memory_usage_active:.2f} GB)" + ) + + await llm.resume(tags) + memory_usage_resumed = get_current_process_gpu_memory(True) / 1024**3 + print(f"[Cycle {cycle + 1}] Memory usage after resume: {memory_usage_resumed:.2f} GB") + if process_gpu_memory_info_available: + assert 
memory_usage_resumed > memory_usage_released, ( + f"Resumed memory ({memory_usage_resumed:.2f} GB) should be > " + f"released memory ({memory_usage_released:.2f} GB)" + ) + + output_after = await llm.generate_async(prompt, sampling_params) + text_after = output_after.outputs[0].text + + print(f"[Cycle {num_cycles}] Generated text after release/resume: {text_after}") + assert baseline_text == text_after, ( + f"Generated text mismatch after {num_cycles} cycle(s): " + f"'{baseline_text}' != '{text_after}'" + ) + + +@pytest.mark.ray +@pytest.mark.gpu4 +@pytest.mark.asyncio +@pytest.mark.threadleak(enabled=False) +async def test_async_llm_placement_api(monkeypatch): + monkeypatch.setenv("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1") + + n_gpus = 4 + bundle_indices = [2, 3] + tp_size = len(bundle_indices) + + pg = None + try: + ray.init() + pg = placement_group([{"GPU": 1, "CPU": 1}] * n_gpus) + ray.get(pg.ready()) + print(f"Placement group ready with bundles {pg.bundle_specs}") + + llm = await AsyncLLM( + model=os.path.join( + str(llm_models_root()), "llama-models-v2", "TinyLlama-1.1B-Chat-v1.0" + ), + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.1), + tensor_parallel_size=tp_size, + placement_groups=[pg], + placement_bundle_indices=[bundle_indices], + per_worker_gpu_share=0.8, + ) + + inference_actor_uuids = await llm.collective_rpc("report_device_id") + expected_uuids = [get_device_uuid(idx) for idx in bundle_indices] + + print(f"{inference_actor_uuids=}, all_uuids={[get_device_uuid(i) for i in range(n_gpus)]}") + + assert sorted(inference_actor_uuids) == sorted(expected_uuids), ( + f"Workers not placed on expected GPUs. 
Expected: {expected_uuids}, Got: {inference_actor_uuids}" + ) + + finally: + if pg is not None: + remove_placement_group(pg) + ray.shutdown() From 24f92721f25a19c77d9128a34c3a72f3a10533e9 Mon Sep 17 00:00:00 2001 From: Simeng Liu <109828133+SimengLiu-nv@users.noreply.github.com> Date: Thu, 11 Dec 2025 10:29:30 -0800 Subject: [PATCH 076/172] [https://nvbugs/5597647][ci] Unwaive fixed tests. (#9812) Signed-off-by: SimengLiu-nv --- tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 524fed462e..54334c4ec0 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -164,7 +164,6 @@ def row_linear_residual_norm_fusion_forward( ) -@pytest.mark.skip(reason="https://nvbugs/5597647") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="needs 2 GPUs to run this test") @pytest.mark.parametrize( From fd1270b9ab4b09836bb42479fb147857965c5285 Mon Sep 17 00:00:00 2001 From: Venky <23023424+venkywonka@users.noreply.github.com> Date: Thu, 11 Dec 2025 12:00:03 -0800 Subject: [PATCH 077/172] [TRTC-43] [feat] Add config db and docs (#9420) Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com> Co-authored-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- .gitignore | 2 + docs/source/deployment-guide/config_table.rst | 1074 +++++++++++++++ ...loyment-guide-for-deepseek-r1-on-trtllm.md | 30 +- .../deployment-guide-for-gpt-oss-on-trtllm.md | 22 +- ...oyment-guide-for-llama3.3-70b-on-trtllm.md | 4 +- ...oyment-guide-for-llama4-scout-on-trtllm.md | 4 +- ...ployment-guide-for-qwen3-next-on-trtllm.md | 4 +- .../deployment-guide-for-qwen3-on-trtllm.md | 4 +- docs/source/deployment-guide/index.rst | 60 +- 
.../source/deployment-guide/note_sections.rst | 36 + .../{ => curated}/deepseek-r1-deepgemm.yaml | 0 .../{ => curated}/deepseek-r1-latency.yaml | 0 .../{ => curated}/deepseek-r1-throughput.yaml | 0 .../{ => curated}/gpt-oss-120b-latency.yaml | 0 .../gpt-oss-120b-throughput.yaml | 0 .../configs/{ => curated}/llama-3.3-70b.yaml | 0 .../configs/{ => curated}/llama-4-scout.yaml | 0 .../{ => curated}/qwen3-disagg-prefill.yaml | 0 .../configs/{ => curated}/qwen3-next.yaml | 0 examples/configs/{ => curated}/qwen3.yaml | 0 examples/configs/database/database.py | 64 + .../B200/1k1k_tp8_conc16.yaml | 18 + .../B200/1k1k_tp8_conc32.yaml | 18 + .../DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml | 18 + .../B200/1k1k_tp8_conc64.yaml | 18 + .../DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml | 18 + .../B200/8k1k_tp8_conc16.yaml | 18 + .../B200/8k1k_tp8_conc32.yaml | 18 + .../DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml | 18 + .../B200/8k1k_tp8_conc64.yaml | 22 + .../DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml | 18 + .../H200/1k1k_tp8_conc16.yaml | 18 + .../H200/1k1k_tp8_conc32.yaml | 18 + .../DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml | 18 + .../H200/1k1k_tp8_conc64.yaml | 18 + .../DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml | 18 + .../H200/8k1k_tp8_conc16.yaml | 18 + .../H200/8k1k_tp8_conc32.yaml | 18 + .../DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml | 18 + .../H200/8k1k_tp8_conc64.yaml | 22 + .../DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml | 18 + examples/configs/database/lookup.yaml | 1176 +++++++++++++++++ .../B200/1k1k_tp4_conc128.yaml | 18 + .../B200/1k1k_tp4_conc16.yaml | 18 + .../B200/1k1k_tp4_conc256.yaml | 22 + .../B200/1k1k_tp4_conc32.yaml | 18 + .../B200/1k1k_tp4_conc4.yaml | 18 + .../B200/1k1k_tp4_conc64.yaml | 18 + .../B200/1k1k_tp4_conc8.yaml | 18 + .../B200/1k1k_tp8_conc128.yaml | 18 + .../B200/1k1k_tp8_conc16.yaml | 18 + .../B200/1k1k_tp8_conc256.yaml | 22 + .../B200/1k1k_tp8_conc32.yaml | 18 + .../B200/1k1k_tp8_conc4.yaml | 18 + .../B200/1k1k_tp8_conc64.yaml | 18 + 
.../B200/1k1k_tp8_conc8.yaml | 18 + .../B200/8k1k_tp4_conc128.yaml | 22 + .../B200/8k1k_tp4_conc16.yaml | 18 + .../B200/8k1k_tp4_conc256.yaml | 22 + .../B200/8k1k_tp4_conc32.yaml | 18 + .../B200/8k1k_tp4_conc4.yaml | 18 + .../B200/8k1k_tp4_conc64.yaml | 22 + .../B200/8k1k_tp4_conc8.yaml | 18 + .../B200/8k1k_tp8_conc128.yaml | 22 + .../B200/8k1k_tp8_conc16.yaml | 18 + .../B200/8k1k_tp8_conc256.yaml | 22 + .../B200/8k1k_tp8_conc32.yaml | 18 + .../B200/8k1k_tp8_conc4.yaml | 18 + .../B200/8k1k_tp8_conc64.yaml | 22 + .../B200/8k1k_tp8_conc8.yaml | 18 + .../gpt-oss-120b/B200/1k1k_tp1_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp1_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp1_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp1_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp1_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp2_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp2_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp2_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp2_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp2_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp4_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp4_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp4_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp4_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp4_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp8_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp8_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp8_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp8_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k1k_tp8_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp1_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp1_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp1_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp1_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp1_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp2_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp2_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp2_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp2_conc64.yaml | 22 + 
.../gpt-oss-120b/B200/1k8k_tp2_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp4_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp4_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp4_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp4_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp4_conc8.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp8_conc16.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp8_conc32.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp8_conc4.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp8_conc64.yaml | 22 + .../gpt-oss-120b/B200/1k8k_tp8_conc8.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp1_conc16.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp1_conc32.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp1_conc4.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp1_conc64.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp1_conc8.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp2_conc16.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp2_conc32.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp2_conc4.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp2_conc64.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp2_conc8.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp4_conc16.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp4_conc32.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp4_conc4.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp4_conc64.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp4_conc8.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp8_conc16.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp8_conc32.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp8_conc4.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp8_conc64.yaml | 22 + .../gpt-oss-120b/B200/8k1k_tp8_conc8.yaml | 22 + .../gpt-oss-120b/H200/1k1k_tp1_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp1_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp1_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp1_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp1_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp2_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp2_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp2_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp2_conc64.yaml | 21 + 
.../gpt-oss-120b/H200/1k1k_tp2_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp4_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp4_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp4_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp4_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp4_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp8_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp8_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp8_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp8_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k1k_tp8_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp1_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp1_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp1_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp1_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp1_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp2_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp2_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp2_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp2_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp2_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp4_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp4_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp4_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp4_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp4_conc8.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp8_conc16.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp8_conc32.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp8_conc4.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp8_conc64.yaml | 21 + .../gpt-oss-120b/H200/1k8k_tp8_conc8.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp1_conc16.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp1_conc32.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp1_conc4.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp1_conc64.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp1_conc8.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp2_conc16.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp2_conc32.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp2_conc4.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp2_conc64.yaml | 21 + 
.../gpt-oss-120b/H200/8k1k_tp2_conc8.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp4_conc16.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp4_conc32.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp4_conc4.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp4_conc64.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp4_conc8.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp8_conc16.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp8_conc32.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp8_conc4.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp8_conc64.yaml | 21 + .../gpt-oss-120b/H200/8k1k_tp8_conc8.yaml | 21 + examples/models/core/qwen/README.md | 6 +- scripts/generate_config_table.py | 169 +++ tensorrt_llm/llmapi/llm_args.py | 10 +- tests/unittest/llmapi/test_config_database.py | 64 + .../tools/test_generate_config_table.py | 66 + 195 files changed, 6234 insertions(+), 45 deletions(-) create mode 100644 docs/source/deployment-guide/config_table.rst create mode 100644 docs/source/deployment-guide/note_sections.rst rename examples/configs/{ => curated}/deepseek-r1-deepgemm.yaml (100%) rename examples/configs/{ => curated}/deepseek-r1-latency.yaml (100%) rename examples/configs/{ => curated}/deepseek-r1-throughput.yaml (100%) rename examples/configs/{ => curated}/gpt-oss-120b-latency.yaml (100%) rename examples/configs/{ => curated}/gpt-oss-120b-throughput.yaml (100%) rename examples/configs/{ => curated}/llama-3.3-70b.yaml (100%) rename examples/configs/{ => curated}/llama-4-scout.yaml (100%) rename examples/configs/{ => curated}/qwen3-disagg-prefill.yaml (100%) rename examples/configs/{ => curated}/qwen3-next.yaml (100%) rename examples/configs/{ => curated}/qwen3.yaml (100%) create mode 100644 examples/configs/database/database.py create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml create mode 
100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/lookup.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml create mode 100644 
examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml create mode 100644 
examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml create mode 100644 
examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml create mode 100644 
examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml create mode 100644 
examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml create mode 100644 
examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml create mode 100644 
examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml create mode 100644 examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml create mode 100644 scripts/generate_config_table.py create mode 100644 tests/unittest/llmapi/test_config_database.py create mode 100644 tests/unittest/tools/test_generate_config_table.py diff --git a/.gitignore b/.gitignore index ccecb77e98..712957ddd5 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,8 @@ tensorrt_llm/scripts *docs/source/_cpp_gen* docs/source/**/*.rst 
!docs/source/examples/index.rst +!docs/source/deployment-guide/config_table.rst +!docs/source/deployment-guide/note_sections.rst *.swp # Testing diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst new file mode 100644 index 0000000000..d28fed25a8 --- /dev/null +++ b/docs/source/deployment-guide/config_table.rst @@ -0,0 +1,1074 @@ +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. start-deepseek-ai/DeepSeek-R1-0528 + +.. _deepseek-ai/DeepSeek-R1-0528: + +`DeepSeek-R1 `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve 
deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + +.. end-deepseek-ai/DeepSeek-R1-0528 + +.. 
start-nvidia/DeepSeek-R1-0528-FP4-v2 + +.. _nvidia/DeepSeek-R1-0528-FP4-v2: + +`DeepSeek-R1 (NVFP4) `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve 
nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp4_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` 
+ * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 128 + - `8k1k_tp4_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 128 + - `8k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + +.. end-nvidia/DeepSeek-R1-0528-FP4-v2 + +.. start-openai/gpt-oss-120b + +.. _openai/gpt-oss-120b: + +`gpt-oss-120b `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - B200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * 
- B200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - B200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - B200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` + * 
- 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + * - B200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + * - B200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 
1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + 
- 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + * - 
H200_SXM + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + +.. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md index 3a25c5c752..e4165eac09 100644 --- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md @@ -66,7 +66,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
@@ -74,7 +74,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/deepseek-r1-throughput.yaml +```{literalinclude} ../../../examples/configs/curated/deepseek-r1-throughput.yaml --- language: shell prepend: | @@ -90,7 +90,7 @@ To use the `DeepGEMM` MOE backend on B200/GB200, use this config instead: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -98,7 +98,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/deepseek-r1-deepgemm.yaml +```{literalinclude} ../../../examples/configs/curated/deepseek-r1-deepgemm.yaml --- language: shell prepend: | @@ -154,7 +154,7 @@ These options provide control over TensorRT LLM's behavior and are set within th #### `trust_remote_code` - **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. +* **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. #### `kv_cache_config` @@ -429,3 +429,23 @@ $$ $$ \text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}} $$ + +## Preconfigured Recipes + +The following tables list recommended configurations from the comprehensive database for different performance profiles. + +```{eval-rst} +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. 
include:: config_table.rst + :start-after: .. start-deepseek-ai/DeepSeek-R1-0528 + :end-before: .. end-deepseek-ai/DeepSeek-R1-0528 +``` + +```{eval-rst} +.. include:: config_table.rst + :start-after: .. start-nvidia/DeepSeek-R1-0528-FP4-v2 + :end-before: .. end-nvidia/DeepSeek-R1-0528-FP4-v2 +``` diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md index 86fc4bc786..5a9f9f4c72 100644 --- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md @@ -64,7 +64,7 @@ For low-latency use cases: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -72,7 +72,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/gpt-oss-120b-latency.yaml +```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-latency.yaml --- language: shell prepend: | @@ -88,7 +88,7 @@ For max-throughput use cases: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
@@ -96,7 +96,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/gpt-oss-120b-throughput.yaml +```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-throughput.yaml --- language: shell prepend: | @@ -377,3 +377,17 @@ $$ $$ \text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}} $$ + +## Preconfigured Recipes + +The following table lists recommended configurations from the comprehensive database for different performance profiles. + +```{eval-rst} +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. include:: config_table.rst + :start-after: .. start-openai/gpt-oss-120b + :end-before: .. end-openai/gpt-oss-120b +``` diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md index 583ef56b49..d3e328d810 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md @@ -58,7 +58,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
@@ -66,7 +66,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/llama-3.3-70b.yaml +```{literalinclude} ../../../examples/configs/curated/llama-3.3-70b.yaml --- language: shell prepend: | diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md index 10db2e128f..7d69b7a8be 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md @@ -57,7 +57,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
@@ -65,7 +65,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/llama-4-scout.yaml +```{literalinclude} ../../../examples/configs/curated/llama-4-scout.yaml --- language: shell prepend: | diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md index 246fc74a56..46bf724b71 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md @@ -35,7 +35,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3-next.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
@@ -43,7 +43,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/qwen3-next.yaml +```{literalinclude} ../../../examples/configs/curated/qwen3-next.yaml --- language: shell prepend: | diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md index 190740ebd8..894c6a1e63 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md @@ -40,7 +40,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -48,7 +48,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/qwen3.yaml +```{literalinclude} ../../../examples/configs/curated/qwen3.yaml --- language: shell prepend: | diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst index ed7fd9c536..644a9d9ae9 100644 --- a/docs/source/deployment-guide/index.rst +++ b/docs/source/deployment-guide/index.rst @@ -6,15 +6,20 @@ Quick Start for Popular Models The table below contains ``trtllm-serve`` commands that can be used to easily deploy popular models including DeepSeek-R1, gpt-oss, Llama 4, Qwen3, and more. -We maintain LLM API configuration files for these models containing recommended performance settings in the `examples/configs `_ directory. 
The TensorRT LLM Docker container makes the config files available at ``/app/tensorrt_llm/examples/configs``, but you can customize this as needed: +We maintain LLM API configuration files for these models containing recommended performance settings in two locations: + +* **Curated Examples**: `examples/configs/curated `_ - Hand-picked configurations for common scenarios. +* **Comprehensive Database**: `examples/configs/database `_ - A more comprehensive set of known-good configurations for various GPUs and traffic patterns. + +The TensorRT LLM Docker container makes these config files available at ``/app/tensorrt_llm/examples/configs/curated`` and ``/app/tensorrt_llm/examples/configs/database`` respectively. You can reference them as needed: .. code-block:: bash export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. note:: - - The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, you may benefit from additional tuning. In the future, we plan to provide more configs for a wider range of traffic patterns. +.. include:: note_sections.rst + :start-after: .. start-note-quick-start-isl-osl + :end-before: .. end-note-quick-start-isl-osl This table is designed to provide a straightforward starting point; for detailed model-specific deployment guides, check out the guides below. 
@@ -30,53 +35,53 @@ This table is designed to provide a straightforward starting point; for detailed * - `DeepSeek-R1 `_ - H100, H200 - Max Throughput - - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml`` + - `deepseek-r1-throughput.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml`` + - `deepseek-r1-deepgemm.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml`` + - `deepseek-r1-throughput.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-latency.yaml`` + - `deepseek-r1-latency.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml`` + - `gpt-oss-120b-throughput.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml`` + - `gpt-oss-120b-latency.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3-next.yaml`` + - `qwen3-next.yaml `_ + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. `Qwen3-30B-A3B `_) - Any - Max Throughput - - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - `qwen3.yaml `_ + - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml`` + - `llama-3.3-70b.yaml `_ + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml`` + - `llama-4-scout.yaml `_ + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides 
--------------------------------- @@ -94,3 +99,10 @@ The deployment guides below provide more detailed instructions for serving speci deployment-guide-for-qwen3-on-trtllm.md deployment-guide-for-qwen3-next-on-trtllm.md deployment-guide-for-kimi-k2-thinking-on-trtllm.md + +Comprehensive Configuration Database +------------------------------------ + +The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings. + +.. include:: config_table.rst diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/deployment-guide/note_sections.rst new file mode 100644 index 0000000000..4cd0d1c41d --- /dev/null +++ b/docs/source/deployment-guide/note_sections.rst @@ -0,0 +1,36 @@ +.. + Reusable note sections for deployment guides. + Include specific notes using: + + .. include:: note_sections.rst + :start-after: .. start-note-<name> + :end-before: .. end-note-<name> + +.. start-note-traffic-patterns + +.. note:: + + **Traffic Patterns**: The ISL (Input Sequence Length) and OSL (Output Sequence Length) + values in each configuration represent the **maximum supported values** for that config. + Requests exceeding these limits may result in errors. + + To handle requests with input sequences **longer than the configured ISL**, add the following + to your config file: + + .. code-block:: yaml + + enable_chunked_prefill: true + + This enables chunked prefill, which processes long input sequences in chunks rather than + requiring them to fit within a single prefill operation. Note that enabling chunked prefill + does **not** guarantee optimal performance—these configs are tuned for the specified ISL/OSL. + +.. end-note-traffic-patterns + +.. start-note-quick-start-isl-osl + +.. note:: + + The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024.
If your traffic pattern is different, refer to the :ref:`Comprehensive Configuration Database` section below which covers a larger set of traffic patterns and performance profiles. + +.. end-note-quick-start-isl-osl diff --git a/examples/configs/deepseek-r1-deepgemm.yaml b/examples/configs/curated/deepseek-r1-deepgemm.yaml similarity index 100% rename from examples/configs/deepseek-r1-deepgemm.yaml rename to examples/configs/curated/deepseek-r1-deepgemm.yaml diff --git a/examples/configs/deepseek-r1-latency.yaml b/examples/configs/curated/deepseek-r1-latency.yaml similarity index 100% rename from examples/configs/deepseek-r1-latency.yaml rename to examples/configs/curated/deepseek-r1-latency.yaml diff --git a/examples/configs/deepseek-r1-throughput.yaml b/examples/configs/curated/deepseek-r1-throughput.yaml similarity index 100% rename from examples/configs/deepseek-r1-throughput.yaml rename to examples/configs/curated/deepseek-r1-throughput.yaml diff --git a/examples/configs/gpt-oss-120b-latency.yaml b/examples/configs/curated/gpt-oss-120b-latency.yaml similarity index 100% rename from examples/configs/gpt-oss-120b-latency.yaml rename to examples/configs/curated/gpt-oss-120b-latency.yaml diff --git a/examples/configs/gpt-oss-120b-throughput.yaml b/examples/configs/curated/gpt-oss-120b-throughput.yaml similarity index 100% rename from examples/configs/gpt-oss-120b-throughput.yaml rename to examples/configs/curated/gpt-oss-120b-throughput.yaml diff --git a/examples/configs/llama-3.3-70b.yaml b/examples/configs/curated/llama-3.3-70b.yaml similarity index 100% rename from examples/configs/llama-3.3-70b.yaml rename to examples/configs/curated/llama-3.3-70b.yaml diff --git a/examples/configs/llama-4-scout.yaml b/examples/configs/curated/llama-4-scout.yaml similarity index 100% rename from examples/configs/llama-4-scout.yaml rename to examples/configs/curated/llama-4-scout.yaml diff --git a/examples/configs/qwen3-disagg-prefill.yaml 
b/examples/configs/curated/qwen3-disagg-prefill.yaml similarity index 100% rename from examples/configs/qwen3-disagg-prefill.yaml rename to examples/configs/curated/qwen3-disagg-prefill.yaml diff --git a/examples/configs/qwen3-next.yaml b/examples/configs/curated/qwen3-next.yaml similarity index 100% rename from examples/configs/qwen3-next.yaml rename to examples/configs/curated/qwen3-next.yaml diff --git a/examples/configs/qwen3.yaml b/examples/configs/curated/qwen3.yaml similarity index 100% rename from examples/configs/qwen3.yaml rename to examples/configs/curated/qwen3.yaml diff --git a/examples/configs/database/database.py b/examples/configs/database/database.py new file mode 100644 index 0000000000..e0c73a8ef1 --- /dev/null +++ b/examples/configs/database/database.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from pathlib import Path +from typing import Any, Dict, Iterator, List + +import yaml +from pydantic import BaseModel, Field, RootModel + +DATABASE_LIST_PATH = Path(__file__).parent / "lookup.yaml" + + +class RecipeConstraints(BaseModel): + """Recipe record for scenario list.""" + + model: str = Field(description="Model name") + gpu: str = Field(description="GPU name") + isl: int = Field(description="Input sequence length") + osl: int = Field(description="Output sequence length") + concurrency: int = Field(description="Concurrency") + config_path: str = Field(description="Configuration path") + num_gpus: int = Field(description="Number of GPUs") + + def load_config(self) -> Dict[str, Any]: + """Load and return the YAML config at config_path.""" + with open(self.config_path) as f: + data = yaml.safe_load(f) + return data if data is not None else {} + + +class Recipe(BaseModel): + """Recipe that describes a single scenario.""" + + constraints: RecipeConstraints = Field(description="Recipe constraints") + env_overrides: Dict[str, Any] = Field(description="Environment overrides", default_factory=dict) + config: Dict[str, Any] = Field(description="Configuration overrides", default_factory=dict) + + +class RecipeList(RootModel[List[RecipeConstraints]]): + @classmethod + def from_yaml(cls, yaml_path: Path) -> "RecipeList": + """Load and validate recipe list from YAML file.""" + with open(yaml_path) as f: + data = yaml.safe_load(f) + return cls(data) + + def __iter__(self) -> Iterator[RecipeConstraints]: + return iter(self.root) + + def __len__(self) -> int: + return len(self.root) diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..f770a6566e --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + 
max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..f770a6566e --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..f770a6566e --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml 
b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..f770a6566e --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..f770a6566e --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..6660bcea96 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM 
+tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..6660bcea96 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..6660bcea96 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..919a028409 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + 
enable_padding: true + max_batch_size: 256 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..6660bcea96 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: DEEPGEMM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..008da1df54 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git 
a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..008da1df54 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..008da1df54 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..008da1df54 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + 
enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..008da1df54 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..decbb1744a --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..decbb1744a --- /dev/null +++ 
b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..decbb1744a --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..363eebf521 --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 
+trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..decbb1744a --- /dev/null +++ b/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/lookup.yaml b/examples/configs/database/lookup.yaml new file mode 100644 index 0000000000..d1ac7143ce --- /dev/null +++ b/examples/configs/database/lookup.yaml @@ -0,0 +1,1176 @@ +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 128 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 256 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + 
config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 128 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 256 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml + 
num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 128 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 256 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml + num_gpus: 4 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 128 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + 
concurrency: 256 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml + num_gpus: 8 +- model: nvidia/DeepSeek-R1-0528-FP4-v2 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml + num_gpus: 
8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: 
examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml + num_gpus: 8 +- model: deepseek-ai/DeepSeek-R1-0528 + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: 
examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + 
gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: 
examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + 
gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: 
examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + 
gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: B200_NVL + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: 
examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + 
gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: 
examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + 
gpu: H200_SXM + isl: 1024 + osl: 8192 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml + num_gpus: 1 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: 
examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml + num_gpus: 2 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml + num_gpus: 4 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 16 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 32 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 4 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 64 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml + num_gpus: 8 +- model: openai/gpt-oss-120b + gpu: H200_SXM + isl: 8192 + osl: 1024 + concurrency: 8 + config_path: examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml + num_gpus: 8 diff --git 
a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml new file mode 100644 index 0000000000..c61e3abc15 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1216 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..fe58a6a32b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml new file mode 100644 index 0000000000..2a06d3978d --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + 
free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1344 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..fe58a6a32b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..fe58a6a32b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml 
b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..fe58a6a32b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..fe58a6a32b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml new file mode 100644 index 0000000000..a4a4fe28c7 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: 
TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1216 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..397565e15b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml new file mode 100644 index 0000000000..686db04f1f --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1344 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..397565e15b --- /dev/null +++ 
b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..397565e15b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..397565e15b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git 
a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..397565e15b --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 1152 +max_seq_len: 2068 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml new file mode 100644 index 0000000000..ace419c0d8 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8384 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..a0f2de5fec --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 
+enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml new file mode 100644 index 0000000000..3c812ea3e9 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8512 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..a0f2de5fec --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git 
a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..a0f2de5fec --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..06f600c1cd --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..a0f2de5fec --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 
+enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml new file mode 100644 index 0000000000..5334ed3cf5 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8384 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..382a3c9045 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git 
a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml new file mode 100644 index 0000000000..639fdde94a --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8512 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..382a3c9045 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..382a3c9045 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 
+enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..930a625308 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: true +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: CUTLASS +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..382a3c9045 --- /dev/null +++ b/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml @@ -0,0 +1,18 @@ +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 8320 +max_seq_len: 9416 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml 
b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml new file mode 100644 index 0000000000..1d4df97010 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml new file mode 100644 index 0000000000..7d65f54710 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml new file mode 100644 index 0000000000..ca850a7758 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml new file mode 100644 index 0000000000..345b0e5013 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml new file mode 100644 index 0000000000..5fa5e373d2 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git 
a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml new file mode 100644 index 0000000000..7b392ada8d --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml new file mode 100644 index 0000000000..e8212dd139 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml new file mode 100644 index 0000000000..ab22a7baf6 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 
+cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml new file mode 100644 index 0000000000..3f82650480 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml new file mode 100644 index 0000000000..b07960f33d --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch 
+max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..e078ea3d6d --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..15f5a3ca50 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..cdbb40a3eb --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + 
TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..c5854b6daf --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..0ac4431175 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 
+trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..a18faa2622 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..4ce42b3ce8 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..966138c163 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml @@ 
-0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..a322f0681d --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..644d2dabb4 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 
+moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml new file mode 100644 index 0000000000..31544aa9f4 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml new file mode 100644 index 0000000000..ec0ea7b2ba --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml new file mode 100644 index 0000000000..249b14723f --- /dev/null +++ 
b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml new file mode 100644 index 0000000000..21de3414a8 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml new file mode 100644 index 0000000000..315b1add42 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 
+num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml new file mode 100644 index 0000000000..56e1b648bd --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml new file mode 100644 index 0000000000..4e02fe671b --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml new file mode 100644 index 
0000000000..4bc360839a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml new file mode 100644 index 0000000000..584fb5ae1a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml new file mode 100644 index 0000000000..6ab46126d5 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 
+print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml new file mode 100644 index 0000000000..ef539d3bef --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml new file mode 100644 index 0000000000..40dc752084 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml 
b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml new file mode 100644 index 0000000000..3e0f48e7e1 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml new file mode 100644 index 0000000000..2e3721c712 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml new file mode 100644 index 0000000000..098e7ec388 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml new file mode 100644 index 0000000000..45d77f70bd --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml new file mode 100644 index 0000000000..9436b07959 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git 
a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml new file mode 100644 index 0000000000..a2917bfd5b --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml new file mode 100644 index 0000000000..702d3bc00c --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml new file mode 100644 index 0000000000..c0b90314c3 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + 
enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml new file mode 100644 index 0000000000..31544aa9f4 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml new file mode 100644 index 0000000000..ec0ea7b2ba --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 
+max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml new file mode 100644 index 0000000000..249b14723f --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml new file mode 100644 index 0000000000..21de3414a8 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml new file mode 100644 index 0000000000..315b1add42 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + 
NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml new file mode 100644 index 0000000000..56e1b648bd --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml new file mode 100644 index 0000000000..4e02fe671b --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: 
true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml new file mode 100644 index 0000000000..4bc360839a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml new file mode 100644 index 0000000000..584fb5ae1a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml new file mode 100644 index 0000000000..6ab46126d5 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml @@ -0,0 +1,22 @@ 
+env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..ef539d3bef --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..40dc752084 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 
+moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..3e0f48e7e1 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..2e3721c712 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..098e7ec388 --- /dev/null +++ 
b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..45d77f70bd --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..9436b07959 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 
+num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..a2917bfd5b --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..702d3bc00c --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml new file mode 100644 index 
0000000000..c0b90314c3 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml @@ -0,0 +1,22 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: TRTLLM +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml new file mode 100644 index 0000000000..2eea897e2f --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml new file mode 100644 index 0000000000..1a0d44fb27 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON 
+num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml new file mode 100644 index 0000000000..82662456f0 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml new file mode 100644 index 0000000000..57d8e2ada2 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml new file mode 100644 index 0000000000..87e34788d7 --- /dev/null +++ 
b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml new file mode 100644 index 0000000000..57b4b87fc7 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml new file mode 100644 index 0000000000..0d796e4751 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 
+tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml new file mode 100644 index 0000000000..f6c41d8bbd --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml new file mode 100644 index 0000000000..fdec025db8 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml new file mode 100644 index 0000000000..8565e82e36 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml 
@@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..4773067517 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..5e0d27c5ea --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: 
pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..9b135c0a32 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..6874784b9f --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..cc1d2d8ac9 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + 
enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml new file mode 100644 index 0000000000..f7e46b17a3 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..1b1b874c3e --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git 
a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..28a7f3d17c --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..8036e74399 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..12289904ed --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false 
+kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 2068 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml new file mode 100644 index 0000000000..7ccdc4ae11 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml new file mode 100644 index 0000000000..ea6a93ba64 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml 
b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml new file mode 100644 index 0000000000..a0149f2ab5 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml new file mode 100644 index 0000000000..3ae56a300a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml new file mode 100644 index 0000000000..c18bc3c758 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml new file mode 100644 index 0000000000..e88b4e05fe --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml new file mode 100644 index 0000000000..95b8e20733 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml new 
file mode 100644 index 0000000000..c35b691a81 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml new file mode 100644 index 0000000000..ce0f7c2757 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml new file mode 100644 index 0000000000..344166bc32 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 
+print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml new file mode 100644 index 0000000000..4f895199b1 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml new file mode 100644 index 0000000000..ca549de3d2 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml new file mode 100644 index 0000000000..b87044bbc0 --- /dev/null +++ 
b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml new file mode 100644 index 0000000000..9af104970e --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml new file mode 100644 index 0000000000..7440c3fcb7 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 
4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml new file mode 100644 index 0000000000..b1d8a6eead --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml new file mode 100644 index 0000000000..f8c7fec13a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml new file mode 100644 index 0000000000..f9cb8feb69 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml @@ -0,0 +1,21 @@ 
+env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml new file mode 100644 index 0000000000..a9124d7007 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml new file mode 100644 index 0000000000..7c2507ace7 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch 
+max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml new file mode 100644 index 0000000000..7ccdc4ae11 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml new file mode 100644 index 0000000000..ea6a93ba64 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml new file mode 100644 index 0000000000..a0149f2ab5 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: 
true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml new file mode 100644 index 0000000000..3ae56a300a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml new file mode 100644 index 0000000000..c18bc3c758 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git 
a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml new file mode 100644 index 0000000000..e88b4e05fe --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml new file mode 100644 index 0000000000..95b8e20733 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml new file mode 100644 index 0000000000..c35b691a81 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false 
+kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml new file mode 100644 index 0000000000..ce0f7c2757 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml new file mode 100644 index 0000000000..344166bc32 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 2 +moe_expert_parallel_size: 2 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml 
b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml new file mode 100644 index 0000000000..4f895199b1 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml new file mode 100644 index 0000000000..ca549de3d2 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml new file mode 100644 index 0000000000..b87044bbc0 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml new file mode 100644 index 0000000000..9af104970e --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml new file mode 100644 index 0000000000..7440c3fcb7 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml new 
file mode 100644 index 0000000000..b1d8a6eead --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 16 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml new file mode 100644 index 0000000000..f8c7fec13a --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 32 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml new file mode 100644 index 0000000000..f9cb8feb69 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 4 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 
+print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml new file mode 100644 index 0000000000..a9124d7007 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 64 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml new file mode 100644 index 0000000000..7c2507ace7 --- /dev/null +++ b/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml @@ -0,0 +1,21 @@ +env_overrides: + TRTLLM_ENABLE_PDL: 1 +cuda_graph_config: + enable_padding: true + max_batch_size: 8 +enable_attention_dp: false +kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +moe_config: + backend: TRITON +num_postprocess_workers: 4 +print_iter_log: true +stream_interval: 20 +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +trust_remote_code: true +backend: pytorch +max_num_tokens: 20000 +max_seq_len: 9236 diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md index 98cc3e4bf2..051d7811f9 100644 --- a/examples/models/core/qwen/README.md +++ b/examples/models/core/qwen/README.md @@ -748,7 +748,7 @@ We maintain YAML 
configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml ``` #### trtllm-serve @@ -779,7 +779,7 @@ For example, you can launch a single context server on port 8001 with: ```bash export TRTLLM_USE_UCX_KVCACHE=1 export TRTLLM_DIR=/app/tensorrt_llm -export EXTRA_LLM_API_FILE="${TRTLLM_DIR}/examples/configs/qwen3-disagg-prefill.yaml" +export EXTRA_LLM_API_FILE="${TRTLLM_DIR}/examples/configs/curated/qwen3-disagg-prefill.yaml" trtllm-serve Qwen3-30B-A3B/ --port 8001 --extra_llm_api_options ${EXTRA_LLM_API_FILE} &> output_ctx & ``` @@ -789,7 +789,7 @@ And you can launch two generation servers on port 8002 and 8003 with: ```bash export TRTLLM_USE_UCX_KVCACHE=1 export TRTLLM_DIR=/app/tensorrt_llm -export EXTRA_LLM_API_FILE="${TRTLLM_DIR}/examples/configs/qwen3.yaml" +export EXTRA_LLM_API_FILE="${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml" for port in {8002..8003}; do \ trtllm-serve Qwen3-30B-A3B/ --port ${port} --extra_llm_api_options ${EXTRA_LLM_API_FILE} &> output_gen_${port} & \ diff --git a/scripts/generate_config_table.py b/scripts/generate_config_table.py new file mode 100644 index 0000000000..2d423c0811 --- /dev/null +++ b/scripts/generate_config_table.py @@ -0,0 +1,169 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import sys +from collections import defaultdict +from pathlib import Path + +from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList + +SCRIPT_DIR = Path(__file__).parent.resolve() +REPO_ROOT = SCRIPT_DIR.parent +MODEL_INFO = { + "deepseek-ai/DeepSeek-R1-0528": { + "display_name": "DeepSeek-R1", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", + }, + "nvidia/DeepSeek-R1-0528-FP4-v2": { + "display_name": "DeepSeek-R1 (NVFP4)", + "url": "https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2", + }, + "openai/gpt-oss-120b": { + "display_name": "gpt-oss-120b", + "url": "https://huggingface.co/openai/gpt-oss-120b", + }, +} + +LOW_LATENCY_CONCURRENCY_THRESHOLD = 8 +HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32 + + +def generate_rst(yaml_path, output_file=None): + """Generate RST table from YAML config database. + + Args: + yaml_path: Path to lookup.yaml (str or Path) + output_file: Optional output file path. If None, prints to stdout. + """ + recipe_list = RecipeList.from_yaml(Path(yaml_path)) + + # Group by model -> (gpu, isl, osl) -> list of recipes + model_groups = defaultdict(lambda: defaultdict(list)) + for recipe in recipe_list: + key = (recipe.gpu, recipe.isl, recipe.osl) + model_groups[recipe.model][key].append(recipe) + + lines = [] + + # Include note_sections.rst at the top (relative include for Sphinx) + lines.append(".. include:: note_sections.rst") + lines.append(" :start-after: .. start-note-traffic-patterns") + lines.append(" :end-before: .. 
end-note-traffic-patterns") + lines.append("") + + sorted_models = sorted(model_groups.keys()) + + for model in sorted_models: + lines.append(f".. start-{model}") + lines.append("") + + if model in MODEL_INFO: + info = MODEL_INFO[model] + title_text = f"`{info['display_name']} <{info['url']}>`_" + else: + title_text = model + + lines.append(f".. _{model}:") + lines.append("") + lines.append(title_text) + lines.append("^" * len(title_text)) + lines.append("") + + lines.append(".. list-table::") + lines.append(" :width: 100%") + lines.append(" :header-rows: 1") + lines.append(" :widths: 12 15 15 13 20 25") + lines.append("") + lines.append(" * - GPU") + lines.append(" - Performance Profile") + lines.append(" - ISL / OSL") + lines.append(" - Concurrency") + lines.append(" - Config") + lines.append(" - Command") + + subgroups = model_groups[model] + sorted_keys = sorted( + subgroups.keys(), key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0)) + ) + + for key in sorted_keys: + entries = subgroups[key] + entries.sort(key=lambda x: x.concurrency) + n = len(entries) + + for idx, entry in enumerate(entries): + gpu = entry.gpu + num_gpus = entry.num_gpus + gpu_display = f"{num_gpus}x{gpu}" if num_gpus and num_gpus > 1 else gpu + isl = entry.isl + osl = entry.osl + conc = entry.concurrency + config_path = entry.config_path + + if n == 1: + if conc <= LOW_LATENCY_CONCURRENCY_THRESHOLD: + profile = "Low Latency" + elif conc >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: + profile = "High Throughput" + else: + profile = "Balanced" + elif idx == 0: + profile = "Min Latency" + elif idx == n - 1: + profile = "Max Throughput" + elif idx in ((n - 1) // 2, n // 2): + profile = "Balanced" + elif idx < n // 2: + profile = "Low Latency" + else: + profile = "High Throughput" + + full_config_path = config_path + command = f"trtllm-serve {model} --extra_llm_api_options ${{TRTLLM_DIR}}/{full_config_path}" + + config_filename = os.path.basename(full_config_path) + + github_url = 
f"https://github.com/NVIDIA/TensorRT-LLM/blob/main/{full_config_path}" + config_link = f"`{config_filename} <{github_url}>`_" + + lines.append(f" * - {gpu_display}") + lines.append(f" - {profile}") + lines.append(f" - {isl} / {osl}") + lines.append(f" - {conc}") + lines.append(f" - {config_link}") + lines.append(f" - ``{command}``") + + lines.append("") + lines.append(f".. end-{model}") + lines.append("") + + output_text = "\n".join(lines) + if output_file: + with open(output_file, "w") as f: + f.write(output_text) + print(f"Generated table written to: {output_file}", file=sys.stderr) + else: + print(output_text) + + +if __name__ == "__main__": + yaml_path = DATABASE_LIST_PATH + if not yaml_path.exists(): + print(f"Error: YAML file not found at {yaml_path}", file=sys.stderr) + sys.exit(1) + output_path = REPO_ROOT / "docs/source/deployment-guide/config_table.rst" + generate_rst(yaml_path, output_file=output_path) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 9b32dbe91e..30bfa3675a 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2026,9 +2026,17 @@ class BaseLlmArgs(StrictBaseModel): env_overrides: Optional[Dict[str, str]] = Field( default=None, description= - "[EXPERIMENTAL] Environment variable overrides. NOTE: import-time-cached env vars in the code won’t update unless the code fetches them from os.environ on demand.", + "[EXPERIMENTAL] Environment variable overrides. 
NOTE: import-time-cached env vars in the code won't update unless the code fetches them from os.environ on demand.", status="prototype") + @field_validator('env_overrides', mode='before') + @classmethod + def coerce_env_overrides_to_str(cls, v): + """Coerce env_overrides values to strings for os.environ compatibility.""" + if v is None: + return v + return {str(k): str(val) for k, val in v.items()} + _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None) _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None) _speculative_model: Optional[str] = PrivateAttr(default=None) diff --git a/tests/unittest/llmapi/test_config_database.py b/tests/unittest/llmapi/test_config_database.py new file mode 100644 index 0000000000..72dfce770f --- /dev/null +++ b/tests/unittest/llmapi/test_config_database.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""L0 tests for validating config database YAML files against TorchLlmArgs.""" + +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +import yaml + +from tensorrt_llm.llmapi.llm_args import TorchLlmArgs, update_llm_args_with_extra_dict + +CONFIG_ROOT = Path(__file__).parents[3] / "examples" / "configs" +DATABASE_DIR = CONFIG_ROOT / "database" + +DATABASE_CONFIGS = ( + [c for c in DATABASE_DIR.rglob("*.yaml") if c.name != "lookup.yaml"] + if DATABASE_DIR.exists() + else [] +) + + +@pytest.fixture(autouse=True) +def mock_gpu_environment(): + """Mock GPU functions for CPU-only test execution.""" + mock_props = Mock() + mock_props.major = 8 + + with patch("torch.cuda.device_count", return_value=8): + with patch("torch.cuda.get_device_properties", return_value=mock_props): + with patch("torch.cuda.is_available", return_value=True): + yield + + +def get_config_id(config_path: Path) -> str: + return str(config_path.relative_to(DATABASE_DIR)) + + +@pytest.mark.part0 +@pytest.mark.parametrize("config_path", DATABASE_CONFIGS, ids=get_config_id) +def test_config_validates_against_llm_args(config_path: Path): + with open(config_path) as f: + config_dict = yaml.safe_load(f) or {} + + base_args = TorchLlmArgs(model="dummy/model", skip_tokenizer_init=True) + merged = update_llm_args_with_extra_dict(base_args.model_dump(), config_dict) + TorchLlmArgs(**merged) + + +@pytest.mark.part0 +def test_database_config_count(): + assert len(DATABASE_CONFIGS) > 0, "No database config files found" diff --git a/tests/unittest/tools/test_generate_config_table.py b/tests/unittest/tools/test_generate_config_table.py new file mode 100644 index 0000000000..a2dcf66783 --- /dev/null +++ b/tests/unittest/tools/test_generate_config_table.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import tempfile +import unittest + +# Add scripts directory to path +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) +SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") +sys.path.insert(0, SCRIPTS_DIR) + +from generate_config_table import generate_rst # noqa: E402 + + +class TestConfigTableSync(unittest.TestCase): + def test_config_table_sync(self): + """Test that the config_table.rst file is synchronized with the lookup.yaml database. + + Ensures that the RST file is up-to-date with the YAML database. 
+ """ + if generate_rst is None: + self.skipTest("generate_config_table not available") + + # Define paths + yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") + rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") + + # Ensure files exist + self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") + self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") + + # Read existing RST content + with open(rst_path, "r") as f: + existing_content = f.read() + + # Generate new RST content + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: + generate_rst(yaml_path, output_file=tmp.name) + tmp.seek(0) + generated_content = tmp.read() + + # Compare content + self.assertEqual( + existing_content.strip(), + generated_content.strip(), + "config_table.rst is not synchronized with lookup.yaml. " + "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", + ) + + +if __name__ == "__main__": + unittest.main() From 95d928f07189c3cfb9b80e753d56ea389084278e Mon Sep 17 00:00:00 2001 From: Kanghwan <861393+karljang@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:43:30 -0800 Subject: [PATCH 078/172] [None][infra] Add workflow to auto-label 'waiting for feedback' on team comments (#9886) Signed-off-by: Kanghwan Jang <861393+karljang@users.noreply.github.com> --- .github/workflows/waiting_for_feedback.yml | 119 +++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 .github/workflows/waiting_for_feedback.yml diff --git a/.github/workflows/waiting_for_feedback.yml b/.github/workflows/waiting_for_feedback.yml new file mode 100644 index 0000000000..3ec6f4c7b8 --- /dev/null +++ b/.github/workflows/waiting_for_feedback.yml @@ -0,0 +1,119 @@ +name: Manage Waiting for Feedback Label + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + +permissions: + issues: write + pull-requests: write + 
+jobs: + manage-waiting-for-feedback: + runs-on: ubuntu-latest + if: github.repository == 'NVIDIA/TensorRT-LLM' + steps: + - name: Check membership and manage label + uses: actions/github-script@v8 + with: + script: | + const commenter = context.payload.comment.user.login; + const label = 'waiting for feedback'; + + // Handle both issue_comment and pull_request_review_comment events + // context.issue.number is only available for issue_comment events + const issueNumber = context.issue?.number || context.payload.pull_request?.number; + const issue = context.payload.issue || context.payload.pull_request; + const author = issue?.user?.login; + const isAuthor = (commenter === author); + + if (!issueNumber) { + console.log('Could not determine issue/PR number. Skipping.'); + return; + } + + console.log(`Comment by ${commenter} on #${issueNumber} (author: ${author})`); + const owner = context.repo.owner; + const repo = context.repo.repo; + + // Check if commenter is repository member + let isMember = false; + try { + await github.rest.repos.checkCollaborator({ + owner, + repo, + username: commenter + }); + isMember = true; + } catch (error) { + if (error.status === 404) { + isMember = false; + } else if (error.status === 302) { + console.log(`Cannot determine membership for ${commenter} (insufficient token permissions)`); + return; + } else { + console.error(`Error checking membership: ${error.message}`); + throw error; + } + } + + // Logic: + // - Author responds → remove label (feedback provided) + // - NVIDIA non-author comments → add label (team is waiting for response) + // - External non-author comments → remove label (someone provided feedback) + + if (isAuthor) { + // Author responded - remove 'waiting for feedback' label + console.log(`${commenter} is the author. 
Removing '${label}' label if present.`); + + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + name: label + }); + console.log(`Successfully removed '${label}' label from #${issueNumber}`); + } catch (error) { + if (error.status === 404) { + console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`); + } else { + throw error; + } + } + + } else if (isMember) { + // NVIDIA non-author commented - add 'waiting for feedback' label + console.log(`${commenter} is an NVIDIA member (not author). Adding '${label}' label.`); + + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + labels: [label] + }); + + console.log(`Successfully added '${label}' label to #${issueNumber}`); + + } else { + // External non-author commented - remove 'waiting for feedback' label + console.log(`${commenter} is external (not author). Removing '${label}' label if present.`); + + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + name: label + }); + console.log(`Successfully removed '${label}' label from #${issueNumber}`); + } catch (error) { + if (error.status === 404) { + console.log(`Label '${label}' was not present on #${issueNumber}. 
No action needed.`); + } else { + throw error; + } + } + } From 4f6d4da035817a144a8539f684f10b554417692b Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Thu, 11 Dec 2025 13:55:31 -0800 Subject: [PATCH 079/172] [None][perf] Fix TPOT when `min_tokens` set (#9862) Signed-off-by: jthomson04 --- tensorrt_llm/_torch/pyexecutor/sampler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 40d1450e45..83826eaad7 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -2230,9 +2230,14 @@ class TorchSampler(Sampler, AsyncWorkerMixin): for beam_idx in range(num_beams[index]): for step in range(num_steps[index]): if r.get_num_tokens(beam_idx) + step < r.py_min_length[0]: + # NOTE(jthomson04): We can NOT just assign logits[...] = float("-inf"). + # This introduces a pageable HtoD transfer, which wreaks havoc on TPOT (up to ~20%) + # Instead, we create a little tensor on device, then assign to that. + # This way, we avoid the pageable transfer. 
+ neg_inf_tensor = torch.full((), float("-inf"), device=logits.device) logits[ current_offset + num_steps[index] * beam_idx + step, r.py_end_id - ] = float("-inf") + ] = neg_inf_tensor else: # early exit break From 98c68c195b7ec3562c5ff054cc4aa7b1cbd5c20b Mon Sep 17 00:00:00 2001 From: Kanghwan <861393+karljang@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:20:51 -0800 Subject: [PATCH 080/172] [None][infra] Ignore comments from bots and CI accounts (#9929) Signed-off-by: Kanghwan Jang <861393+karljang@users.noreply.github.com> --- .github/workflows/waiting_for_feedback.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/waiting_for_feedback.yml b/.github/workflows/waiting_for_feedback.yml index 3ec6f4c7b8..e0542c0375 100644 --- a/.github/workflows/waiting_for_feedback.yml +++ b/.github/workflows/waiting_for_feedback.yml @@ -20,8 +20,16 @@ jobs: with: script: | const commenter = context.payload.comment.user.login; + const commenterType = context.payload.comment.user.type; const label = 'waiting for feedback'; + // Ignore bots and CI accounts + const ignoredAccounts = ['tensorrt-cicd']; + if (commenterType === 'Bot' || ignoredAccounts.includes(commenter)) { + console.log(`Ignoring comment from ${commenter} (type: ${commenterType}). 
Skipping.`); + return; + } + // Handle both issue_comment and pull_request_review_comment events // context.issue.number is only available for issue_comment events const issueNumber = context.issue?.number || context.payload.pull_request?.number; From 710c592d7c2cbcd3f163f953f63c912fba1b8d3a Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:45:34 +0800 Subject: [PATCH 081/172] [https://nvbugs/5727517][fix] Preserve ip:port for disagg (#9859) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tensorrt_llm/commands/serve.py | 49 +++++++++++++--------- tensorrt_llm/serve/openai_disagg_server.py | 5 ++- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 716e27bda4..7e08295ade 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -635,29 +635,38 @@ def disaggregated( disagg_cfg = parse_disagg_config_file(config_file) - metadata_server_cfg = parse_metadata_server_config_file( - metadata_server_config_file) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind((disagg_cfg.hostname, disagg_cfg.port)) + except OSError as e: + raise RuntimeError( + f"Failed to bind socket to {disagg_cfg.hostname}:{disagg_cfg.port}: {e}" + ) - server = OpenAIDisaggServer(config=disagg_cfg, - req_timeout_secs=request_timeout, - server_start_timeout_secs=server_start_timeout, - metadata_server_cfg=metadata_server_cfg, - metrics_interval_secs=metrics_log_interval) + metadata_server_cfg = parse_metadata_server_config_file( + metadata_server_config_file) - # Disable GC by default - # When concurrency is high, the number of Python objects increases, so - # GC runs frequently and takes a long time to process. In this case, - # requests are not immediately forwarded to CTX workers and GEN workers, - # causing them to run with small batch sizes. 
Disabling GC can mitigate - # this problem. - # By testing this feature, we didn't observe significant RSS or VMS - # increment, and observed that `count0` (obtained by `gc.get_count()`) - # increases by fewer than 1,000 after every 200,000 requests, while the - # maximum value of `count0` exceeded 3,000,000 during the test. - if os.getenv("TRTLLM_DISAGG_SERVER_DISABLE_GC", "1") == "1": - gc.disable() + server = OpenAIDisaggServer( + config=disagg_cfg, + req_timeout_secs=request_timeout, + server_start_timeout_secs=server_start_timeout, + metadata_server_cfg=metadata_server_cfg, + metrics_interval_secs=metrics_log_interval) - asyncio.run(server(disagg_cfg.hostname, disagg_cfg.port)) + # Disable GC by default + # When concurrency is high, the number of Python objects increases, so + # GC runs frequently and takes a long time to process. In this case, + # requests are not immediately forwarded to CTX workers and GEN workers, + # causing them to run with small batch sizes. Disabling GC can mitigate + # this problem. + # By testing this feature, we didn't observe significant RSS or VMS + # increment, and observed that `count0` (obtained by `gc.get_count()`) + # increases by fewer than 1,000 after every 200,000 requests, while the + # maximum value of `count0` exceeded 3,000,000 during the test. 
+ if os.getenv("TRTLLM_DISAGG_SERVER_DISABLE_GC", "1") == "1": + gc.disable() + + asyncio.run(server(disagg_cfg.hostname, disagg_cfg.port, sockets=[s])) def set_cuda_device(): diff --git a/tensorrt_llm/serve/openai_disagg_server.py b/tensorrt_llm/serve/openai_disagg_server.py index 55c3e136e5..524dd9fd11 100644 --- a/tensorrt_llm/serve/openai_disagg_server.py +++ b/tensorrt_llm/serve/openai_disagg_server.py @@ -16,6 +16,7 @@ # yapf: disable import asyncio import signal +import socket import traceback from contextlib import asynccontextmanager from typing import Callable, Optional @@ -190,13 +191,13 @@ class OpenAIDisaggServer: async def version(self) -> JSONResponse: return JSONResponse(content={"version": VERSION}) - async def __call__(self, host: str, port: int): + async def __call__(self, host: str, port: int, sockets: list[socket.socket] | None = None): config = uvicorn.Config(self.app, host=host, port=port, log_level=logger.level, timeout_keep_alive=TIMEOUT_KEEP_ALIVE) - await uvicorn.Server(config).serve() + await uvicorn.Server(config).serve(sockets=sockets) # TODO: rework this for service discovery, now it's only for static server list async def _set_steady_clock_offsets(self): From 4670e0c29785996fdf317fdde1cac03bb2525353 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:49:46 +0800 Subject: [PATCH 082/172] [None][infra] update ucx to 1.20 (#9786) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> --- docker/common/install_ucx.sh | 2 +- jenkins/current_image_tags.properties | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index 55da81e2c2..f20e99a52e 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -UCX_VERSION="v1.19.x" +UCX_VERSION="v1.20.x" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" 
UCX_REPO="https://github.com/openucx/ucx.git" diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index ed5f0078bd..dad998f814 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512110629-9786 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512110629-9786 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512110629-9786 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512110629-9786 From e8efeb765d7b2a23e123e80ed10dc7f98348e790 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:14:23 +0800 Subject: [PATCH 083/172] [TRTLLM-9717][fix] fix multi nodes tests cases (#9736) 
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_pytorch.py | 3 ++- tests/integration/defs/test_e2e.py | 23 +++++++++++++------ .../test_lists/qa/llm_function_core.txt | 1 + .../test_lists/qa/llm_function_multinode.txt | 1 - 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index dbc991eb49..9c3b105ecd 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3293,7 +3293,8 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): True, True, True, - marks=pytest.mark.skip_less_mpi_world_size(8))], + marks=(pytest.mark.skip_less_mpi_world_size(8), + pytest.mark.timeout(7200)))], ids=["latency", "multi_gpus_no_cache"]) def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, is_cached): diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 183c7c1760..11dbcbd822 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -26,7 +26,7 @@ import pytest import yaml from defs.common import convert_weights from defs.trt_test_alternative import (check_call, check_call_negative_test, - check_output) + check_output, print_info, print_warning) from .common import (PluginOptions, convert_weights, get_mmlu_accuracy, prune_checkpoint, quantize_data, refit_model, @@ -3230,12 +3230,21 @@ def test_multi_nodes_eval(llm_venv, model_path, tp_size, pp_size, ep_size, run_cmd.extend([eval_task, f"--dataset_path={mmlu_dataset_root}"]) - llm_venv._new_env["TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL"] = "1" - output = check_output(" ".join(run_cmd), shell=True, env=llm_venv._new_env) - - if os.environ.get("SLURM_PROCID", '0') == '0': - mmlu_accuracy = get_mmlu_accuracy(output) - assert mmlu_accuracy > mmlu_threshold, f"MMLU accuracy 
{mmlu_accuracy} is less than threshold {mmlu_threshold}" + try: + # run the command with trtllm-llmapi-launch pytest wrapper + output = subprocess.check_output(run_cmd, + text=True, + stderr=subprocess.STDOUT, + timeout=7200) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + print_warning(f"eval failed: {e.returncode}") + print_warning(f"eval output:\n{e.output}") + raise + else: + if os.environ.get("SLURM_PROCID", '0') == '0': + print_info(f"eval output:\n{output}") + mmlu_accuracy = get_mmlu_accuracy(output) + assert mmlu_accuracy > mmlu_threshold, f"MMLU accuracy {mmlu_accuracy} is less than threshold {mmlu_threshold}" @pytest.mark.skip_less_device_memory(80000) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 71b73a530f..7facb002eb 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -508,6 +508,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] diff --git a/tests/integration/test_lists/qa/llm_function_multinode.txt b/tests/integration/test_lists/qa/llm_function_multinode.txt index 8a3958cf33..f2e3f8d216 100644 --- a/tests/integration/test_lists/qa/llm_function_multinode.txt +++ 
b/tests/integration/test_lists/qa/llm_function_multinode.txt @@ -11,4 +11,3 @@ test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu] test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180) From 5065b60cd1ef730d4b70aec86b5fa93b54485885 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Fri, 12 Dec 2025 11:19:42 +0800 Subject: [PATCH 084/172] [None][infra] Fix mergeWaiveList stage (#9892) Signed-off-by: Yiqing Yan --- jenkins/L0_MergeRequest.groovy | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 234d94c690..deaa59fc26 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -605,6 +605,8 @@ def getMergeRequestChangedFileList(pipeline, globalVars) { } def getMergeRequestOneFileChanges(pipeline, globalVars, filePath) { + // Note: This function intentionally propagates exceptions to the caller. + // If there is an error to get the changed file diff, skip merging the waive list. 
def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/) if (env.alternativeTRT || isOfficialPostMergeJob) { pipeline.echo("Force set changed file diff to empty string.") @@ -614,20 +616,13 @@ def getMergeRequestOneFileChanges(pipeline, globalVars, filePath) { def githubPrApiUrl = globalVars[GITHUB_PR_API_URL] def diff = "" - try { - if (githubPrApiUrl != null) { - diff = getGithubMRChangedFile(pipeline, githubPrApiUrl, "getOneFileChanges", filePath) - } else { - diff = getGitlabMRChangedFile(pipeline, "getOneFileChanges", filePath) - } - pipeline.echo("The change of ${filePath} is: ${diff}") - return diff - } catch (InterruptedException e) { - throw e - } catch (Exception e) { - pipeline.echo("Get merge request one changed file diff failed. Error: ${e.toString()}") - return "" + if (githubPrApiUrl != null) { + diff = getGithubMRChangedFile(pipeline, githubPrApiUrl, "getOneFileChanges", filePath) + } else { + diff = getGitlabMRChangedFile(pipeline, "getOneFileChanges", filePath) } + pipeline.echo("The change of ${filePath} is: ${diff}") + return diff } def getAutoTriggerTagList(pipeline, testFilter, globalVars) { From 0132769c2222c3d0dcd419f1d8dfd51a03246971 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Fri, 12 Dec 2025 03:20:40 +0000 Subject: [PATCH 085/172] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- security_scanning/docs/poetry.lock | 6 +- security_scanning/examples/apps/poetry.lock | 6 +- .../examples/auto_deploy/poetry.lock | 6 +- .../examples/draft_target_model/poetry.lock | 12 +- security_scanning/examples/eagle/poetry.lock | 12 +- .../llm-eval/lm-eval-harness/poetry.lock | 6 +- .../examples/lookahead/poetry.lock | 12 +- security_scanning/examples/medusa/poetry.lock | 12 +- .../models/contrib/baichuan/poetry.lock | 36 ++-- .../examples/models/contrib/bloom/poetry.lock | 12 +- 
.../models/contrib/chatglm-6b/poetry.lock | 12 +- .../models/contrib/chatglm2-6b/poetry.lock | 12 +- .../contrib/chatglm3-6b-32k/poetry.lock | 12 +- .../examples/models/contrib/dbrx/poetry.lock | 12 +- .../models/contrib/deepseek_v1/poetry.lock | 12 +- .../models/contrib/deepseek_v2/poetry.lock | 12 +- .../models/contrib/falcon/poetry.lock | 6 +- .../examples/models/contrib/gptj/poetry.lock | 12 +- .../models/contrib/gptneox/poetry.lock | 12 +- .../examples/models/contrib/grok/poetry.lock | 12 +- .../models/contrib/hyperclovax/poetry.lock | 6 +- .../models/contrib/internlm/poetry.lock | 12 +- .../examples/models/contrib/jais/poetry.lock | 12 +- .../examples/models/contrib/mmdit/poetry.lock | 6 +- .../examples/models/contrib/mpt/poetry.lock | 12 +- .../examples/models/contrib/opt/poetry.lock | 12 +- .../models/contrib/skywork/poetry.lock | 12 +- .../examples/models/contrib/smaug/poetry.lock | 12 +- .../examples/models/contrib/stdit/poetry.lock | 19 +- .../examples/models/core/commandr/poetry.lock | 12 +- .../examples/models/core/gemma/poetry.lock | 18 +- .../examples/models/core/glm-4-9b/poetry.lock | 12 +- .../examples/models/core/gpt/poetry.lock | 12 +- .../examples/models/core/llama/poetry.lock | 6 +- .../examples/models/core/mamba/poetry.lock | 6 +- .../examples/models/core/mixtral/poetry.lock | 6 +- .../examples/models/core/mllama/poetry.lock | 6 +- .../examples/models/core/nemotron/poetry.lock | 12 +- .../examples/models/core/phi/poetry.lock | 12 +- .../examples/models/core/qwen/poetry.lock | 172 +++++++++--------- .../examples/models/core/qwen/pyproject.toml | 2 +- .../models/core/qwen2audio/poetry.lock | 6 +- .../examples/models/core/qwenvl/poetry.lock | 150 +++++++-------- .../models/core/qwenvl/pyproject.toml | 2 +- .../models/core/recurrentgemma/poetry.lock | 6 +- .../examples/models/core/whisper/poetry.lock | 50 ++--- security_scanning/examples/ngram/poetry.lock | 12 +- .../examples/quantization/poetry.lock | 36 ++-- 
.../examples/ray_orchestrator/poetry.lock | 44 ++--- .../examples/redrafter/poetry.lock | 12 +- .../examples/trtllm-eval/poetry.lock | 6 +- security_scanning/metadata.json | 4 +- security_scanning/poetry.lock | 14 +- security_scanning/pyproject.toml | 2 +- .../tests/integration/defs/perf/poetry.lock | 114 ++++++------ .../integration/defs/perf/pyproject.toml | 2 +- security_scanning/triton_backend/poetry.lock | 6 +- 57 files changed, 540 insertions(+), 539 deletions(-) diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index 8e633a2abd..ac1ce39f45 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -1195,13 +1195,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/apps/poetry.lock b/security_scanning/examples/apps/poetry.lock index ea9e651ebb..acd0af2a3e 100644 --- a/security_scanning/examples/apps/poetry.lock +++ b/security_scanning/examples/apps/poetry.lock @@ -263,13 +263,13 @@ files = [ [[package]] name = "openai" -version = "2.9.0" +version = "2.11.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" files = [ - {file = "openai-2.9.0-py3-none-any.whl", hash = 
"sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad"}, - {file = "openai-2.9.0.tar.gz", hash = "sha256:b52ec65727fc8f1eed2fbc86c8eac0998900c7ef63aa2eb5c24b69717c56fa5f"}, + {file = "openai-2.11.0-py3-none-any.whl", hash = "sha256:21189da44d2e3d027b08c7a920ba4454b8b7d6d30ae7e64d9de11dbe946d4faa"}, + {file = "openai-2.11.0.tar.gz", hash = "sha256:b3da01d92eda31524930b6ec9d7167c535e843918d7ba8a76b1c38f1104f321e"}, ] [package.dependencies] diff --git a/security_scanning/examples/auto_deploy/poetry.lock b/security_scanning/examples/auto_deploy/poetry.lock index f41cf9682a..1ff1af731e 100644 --- a/security_scanning/examples/auto_deploy/poetry.lock +++ b/security_scanning/examples/auto_deploy/poetry.lock @@ -3624,13 +3624,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/draft_target_model/poetry.lock b/security_scanning/examples/draft_target_model/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/draft_target_model/poetry.lock +++ b/security_scanning/examples/draft_target_model/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the 
huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/eagle/poetry.lock b/security_scanning/examples/eagle/poetry.lock index d885d2a20e..a47fe4162c 100644 --- a/security_scanning/examples/eagle/poetry.lock +++ b/security_scanning/examples/eagle/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = 
"sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1807,13 +1807,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock index 3463f2a104..70bae1549a 100644 --- a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock +++ b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock @@ -3262,13 +3262,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/lookahead/poetry.lock b/security_scanning/examples/lookahead/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/lookahead/poetry.lock +++ b/security_scanning/examples/lookahead/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/medusa/poetry.lock b/security_scanning/examples/medusa/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/medusa/poetry.lock +++ b/security_scanning/examples/medusa/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/baichuan/poetry.lock b/security_scanning/examples/models/contrib/baichuan/poetry.lock index 4f6a876470..40c6d1c314 100644 --- a/security_scanning/examples/models/contrib/baichuan/poetry.lock +++ b/security_scanning/examples/models/contrib/baichuan/poetry.lock @@ -781,13 +781,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1879,18 +1879,18 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "5.0.0rc0" +version = "5.0.0rc1" description = "Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for 
both inference and training." optional = false python-versions = ">=3.10.0" files = [ - {file = "transformers-5.0.0rc0-py3-none-any.whl", hash = "sha256:1935f8b396891c93b8520d951d4385da1b1b778914e1d79ed151ddbd32d83a22"}, - {file = "transformers-5.0.0rc0.tar.gz", hash = "sha256:bb427000caa4a88943704f80448b2323ad8c6a2f4f13c1433e27d0a1f690c975"}, + {file = "transformers-5.0.0rc1-py3-none-any.whl", hash = "sha256:8b9604700769872cab4280dbcde201f557e93f72ee5a85c4592275ab4f15d330"}, + {file = "transformers-5.0.0rc1.tar.gz", hash = "sha256:1fdde557b96ef8ea277c45b8e0d558f1e167fe28a98593f4c4aec0277e335821"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=1.0.0,<2.0" +huggingface-hub = ">=1.2.1,<2.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" @@ -1903,20 +1903,20 @@ typer-slim = "*" [package.extras] accelerate = ["accelerate (>=1.1.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)", "jmespath 
(>=1.0.1)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=1.1.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm 
(!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", 
"pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow 
(>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] ftfy = ["ftfy"] -hf-xet = ["hf-xet"] +hf-xet = ["hf_xet"] hub-kernels = ["kernels (>=0.10.2,<0.11)"] integrations = ["kernels (>=0.10.2,<0.11)", "optuna", "ray[tune] (>=2.7.0)"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] 
modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] @@ -1932,14 +1932,14 @@ sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] serving = ["accelerate (>=1.1.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "rich", "starlette", "torch (>=2.2)", "uvicorn"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] tiktoken = ["blobfile", "tiktoken"] timm = ["timm (!=1.0.18,<=1.0.19)"] tokenizers = ["tokenizers 
(>=0.22.0,<=0.23.0)"] torch = ["accelerate (>=1.1.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=1.0.0,<2.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=1.2.1,<2.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -1998,13 +1998,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/bloom/poetry.lock b/security_scanning/examples/models/contrib/bloom/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/contrib/bloom/poetry.lock +++ b/security_scanning/examples/models/contrib/bloom/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock index e982d71c4b..afb3d9ddf4 100644 --- a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock index e982d71c4b..afb3d9ddf4 100644 --- a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock index e982d71c4b..afb3d9ddf4 100644 --- a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/dbrx/poetry.lock b/security_scanning/examples/models/contrib/dbrx/poetry.lock index 34f97eabd5..e6494c0704 100644 --- a/security_scanning/examples/models/contrib/dbrx/poetry.lock +++ b/security_scanning/examples/models/contrib/dbrx/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1805,13 +1805,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock index 6305bc2199..84666def73 100644 --- a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/falcon/poetry.lock b/security_scanning/examples/models/contrib/falcon/poetry.lock index 6263aae157..1be44e0473 100644 --- a/security_scanning/examples/models/contrib/falcon/poetry.lock +++ b/security_scanning/examples/models/contrib/falcon/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/gptj/poetry.lock b/security_scanning/examples/models/contrib/gptj/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/contrib/gptj/poetry.lock +++ b/security_scanning/examples/models/contrib/gptj/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/gptneox/poetry.lock b/security_scanning/examples/models/contrib/gptneox/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/contrib/gptneox/poetry.lock +++ b/security_scanning/examples/models/contrib/gptneox/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/grok/poetry.lock b/security_scanning/examples/models/contrib/grok/poetry.lock index 2f119a2247..4d08baf665 100644 --- a/security_scanning/examples/models/contrib/grok/poetry.lock +++ b/security_scanning/examples/models/contrib/grok/poetry.lock @@ -881,13 +881,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -2718,13 +2718,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock index a5a69ee5b4..9b7fb57a32 100644 --- a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock +++ b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock @@ -290,13 +290,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/internlm/poetry.lock b/security_scanning/examples/models/contrib/internlm/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/contrib/internlm/poetry.lock +++ 
b/security_scanning/examples/models/contrib/internlm/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/jais/poetry.lock b/security_scanning/examples/models/contrib/jais/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/contrib/jais/poetry.lock +++ b/security_scanning/examples/models/contrib/jais/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/mmdit/poetry.lock b/security_scanning/examples/models/contrib/mmdit/poetry.lock index 70f96cbc56..23b843e70f 100644 --- a/security_scanning/examples/models/contrib/mmdit/poetry.lock +++ b/security_scanning/examples/models/contrib/mmdit/poetry.lock @@ -1027,13 +1027,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/mpt/poetry.lock b/security_scanning/examples/models/contrib/mpt/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/contrib/mpt/poetry.lock +++ b/security_scanning/examples/models/contrib/mpt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/opt/poetry.lock b/security_scanning/examples/models/contrib/opt/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/contrib/opt/poetry.lock +++ b/security_scanning/examples/models/contrib/opt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/skywork/poetry.lock b/security_scanning/examples/models/contrib/skywork/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/contrib/skywork/poetry.lock +++ b/security_scanning/examples/models/contrib/skywork/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/smaug/poetry.lock b/security_scanning/examples/models/contrib/smaug/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/contrib/smaug/poetry.lock +++ b/security_scanning/examples/models/contrib/smaug/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/contrib/stdit/poetry.lock b/security_scanning/examples/models/contrib/stdit/poetry.lock index 48d254621a..5e72560b56 100644 --- a/security_scanning/examples/models/contrib/stdit/poetry.lock +++ b/security_scanning/examples/models/contrib/stdit/poetry.lock @@ -166,14 +166,15 @@ test-tox-coverage = ["coverage (>=5.5)"] [[package]] name = "bitsandbytes" -version = "0.48.2" +version = "0.49.0" description = "k-bit optimizers and matrix multiplication routines." 
optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "bitsandbytes-0.48.2-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:defbfa374d93809de3811cd2bca6978d1d51ecaa39f5bdd2018e1394a4886603"}, - {file = "bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:cd289562cb7308ee2a707e6884fecca9bbbcfc9ec33a86df2a45e0779692c1a3"}, - {file = "bitsandbytes-0.48.2-py3-none-win_amd64.whl", hash = "sha256:a048c285eb6ff53a8d189880e9dfa421d2bfb54e8cab263311757cf5b742d865"}, + {file = "bitsandbytes-0.49.0-py3-none-macosx_14_0_arm64.whl", hash = "sha256:17d5b57e6d51b78bcfc07da0e93db061181b25bffabfafe101dd9b75c2710872"}, + {file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:7e69951b4d207a676986fce967544d9599f23518d0f09d478295996aeff377c2"}, + {file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0c46cdef50b3174463b6bdf13715c9f1f00b360be3626e3c5d2f8d226af2cf3f"}, + {file = "bitsandbytes-0.49.0-py3-none-win_amd64.whl", hash = "sha256:57a327c6d65f7eda32eb8d416ef8e44d2415c2e7b4fdb735896abd04171ae696"}, ] [package.dependencies] @@ -183,7 +184,7 @@ torch = ">=2.3,<3" [package.extras] benchmark = ["matplotlib", "pandas"] -dev = ["bitsandbytes[test]", "build (>=1.0.0,<2)", "pre-commit (>=3.5.0,<4)", "ruff (==0.11.2)", "wheel (>=0.42,<1)"] +dev = ["bitsandbytes[test]", "build (>=1.0.0,<2)", "pre-commit (>=3.5.0,<4)", "ruff (>=0.14.3,<0.15.0)", "wheel (>=0.42,<1)"] docs = ["hf-doc-builder (==0.5.0)"] test = ["einops (>=0.8.0,<0.9.0)", "lion-pytorch (==0.2.3)", "pytest (>=8.3,<9.0)", "scipy (>=1.11.4,<2)", "transformers (>=4.30.1,<5)"] @@ -2194,13 +2195,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/commandr/poetry.lock b/security_scanning/examples/models/core/commandr/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/core/commandr/poetry.lock +++ b/security_scanning/examples/models/core/commandr/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/gemma/poetry.lock b/security_scanning/examples/models/core/gemma/poetry.lock index afbadc04e8..03d2582047 100644 --- a/security_scanning/examples/models/core/gemma/poetry.lock +++ b/security_scanning/examples/models/core/gemma/poetry.lock @@ -872,13 +872,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1524,13 +1524,13 @@ test = ["flax (>=0.5.3)", "scikit-learn", "scipy (>=1.7.1)"] [[package]] name = "orbax-checkpoint" -version = "0.11.30" +version = "0.11.31" description = "Orbax Checkpoint" optional = false python-versions = ">=3.10" files = [ - {file = 
"orbax_checkpoint-0.11.30-py3-none-any.whl", hash = "sha256:56b15d07af7a4ff655f18d219de850d86944b1552e5143e81f5b15480f240a46"}, - {file = "orbax_checkpoint-0.11.30.tar.gz", hash = "sha256:5395e9fc80b750ee3644ee19f969923c7e3c83369133da5ea256a86d9bb838a6"}, + {file = "orbax_checkpoint-0.11.31-py3-none-any.whl", hash = "sha256:b00e39cd61cbd6c7c78b091ccac0ed1bbf3cf7788e761618e7070761195bfcc0"}, + {file = "orbax_checkpoint-0.11.31.tar.gz", hash = "sha256:f021193a619782655798bc4a285f40612f6fe647ddeb303d1f49cdbc5645e319"}, ] [package.dependencies] @@ -2746,13 +2746,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/glm-4-9b/poetry.lock b/security_scanning/examples/models/core/glm-4-9b/poetry.lock index e982d71c4b..afb3d9ddf4 100644 --- a/security_scanning/examples/models/core/glm-4-9b/poetry.lock +++ b/security_scanning/examples/models/core/glm-4-9b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = 
"sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1923,13 +1923,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/gpt/poetry.lock b/security_scanning/examples/models/core/gpt/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/models/core/gpt/poetry.lock +++ b/security_scanning/examples/models/core/gpt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = 
"sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/llama/poetry.lock b/security_scanning/examples/models/core/llama/poetry.lock index fe7a7444fe..221204bc96 100644 --- a/security_scanning/examples/models/core/llama/poetry.lock +++ b/security_scanning/examples/models/core/llama/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mamba/poetry.lock b/security_scanning/examples/models/core/mamba/poetry.lock index d064087a64..a1d4fd8d28 100644 --- a/security_scanning/examples/models/core/mamba/poetry.lock +++ b/security_scanning/examples/models/core/mamba/poetry.lock @@ -1865,13 +1865,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mixtral/poetry.lock b/security_scanning/examples/models/core/mixtral/poetry.lock index 35127a881e..bed9324f3e 100644 --- a/security_scanning/examples/models/core/mixtral/poetry.lock +++ b/security_scanning/examples/models/core/mixtral/poetry.lock @@ -1304,13 +1304,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index 3bd2e8468d..10398de921 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -1800,13 +1800,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/nemotron/poetry.lock b/security_scanning/examples/models/core/nemotron/poetry.lock index ad690d14e4..e30b8b936c 100644 --- a/security_scanning/examples/models/core/nemotron/poetry.lock +++ b/security_scanning/examples/models/core/nemotron/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1753,13 +1753,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/phi/poetry.lock b/security_scanning/examples/models/core/phi/poetry.lock index 7fc49d12c8..a5ab5081cd 100644 --- a/security_scanning/examples/models/core/phi/poetry.lock +++ b/security_scanning/examples/models/core/phi/poetry.lock @@ -782,13 +782,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1816,13 +1816,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 8ece45a078..4bbf2cccc1 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -652,13 +652,13 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.124.0" +version = "0.124.2" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" files = [ - {file = "fastapi-0.124.0-py3-none-any.whl", hash = "sha256:91596bdc6dde303c318f06e8d2bc75eafb341fc793a0c9c92c0bc1db1ac52480"}, - {file = "fastapi-0.124.0.tar.gz", hash = "sha256:260cd178ad75e6d259991f2fd9b0fee924b224850079df576a3ba604ce58f4e6"}, + {file = "fastapi-0.124.2-py3-none-any.whl", hash = "sha256:6314385777a507bb19b34bd064829fddaea0eea54436deb632b5de587554055c"}, + {file = "fastapi-0.124.2.tar.gz", hash = "sha256:72e188f01f360e2f59da51c8822cbe4bca210c35daaae6321b1b724109101c00"}, ] [package.dependencies] @@ -1542,66 +1542,66 @@ files = [ [[package]] name = "matplotlib" -version = "3.10.7" +version = "3.10.8" description = "Python plotting package" optional = false python-versions = ">=3.10" files = [ - {file = "matplotlib-3.10.7-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7ac81eee3b7c266dd92cee1cd658407b16c57eed08c7421fa354ed68234de380"}, - {file 
= "matplotlib-3.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:667ecd5d8d37813a845053d8f5bf110b534c3c9f30e69ebd25d4701385935a6d"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc1c51b846aca49a5a8b44fbba6a92d583a35c64590ad9e1e950dc88940a4297"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a11c2e9e72e7de09b7b72e62f3df23317c888299c875e2b778abf1eda8c0a42"}, - {file = "matplotlib-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f19410b486fdd139885ace124e57f938c1e6a3210ea13dd29cab58f5d4bc12c7"}, - {file = "matplotlib-3.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:b498e9e4022f93de2d5a37615200ca01297ceebbb56fe4c833f46862a490f9e3"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:53b492410a6cd66c7a471de6c924f6ede976e963c0f3097a3b7abfadddc67d0a"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9749313deb729f08207718d29c86246beb2ea3fdba753595b55901dee5d2fd6"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2222c7ba2cbde7fe63032769f6eb7e83ab3227f47d997a8453377709b7fe3a5a"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e91f61a064c92c307c5a9dc8c05dc9f8a68f0a3be199d9a002a0622e13f874a1"}, - {file = "matplotlib-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f1851eab59ca082c95df5a500106bad73672645625e04538b3ad0f69471ffcc"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:6516ce375109c60ceec579e699524e9d504cd7578506f01150f7a6bc174a775e"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:b172db79759f5f9bc13ef1c3ef8b9ee7b37b0247f987fbbbdaa15e4f87fd46a9"}, - {file = "matplotlib-3.10.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:7a0edb7209e21840e8361e91ea84ea676658aa93edd5f8762793dec77a4a6748"}, - {file = "matplotlib-3.10.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c380371d3c23e0eadf8ebff114445b9f970aff2010198d498d4ab4c3b41eea4f"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5f256d49fea31f40f166a5e3131235a5d2f4b7f44520b1cf0baf1ce568ccff0"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11ae579ac83cdf3fb72573bb89f70e0534de05266728740d478f0f818983c695"}, - {file = "matplotlib-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4c14b6acd16cddc3569a2d515cfdd81c7a68ac5639b76548cfc1a9e48b20eb65"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_amd64.whl", hash = "sha256:0d8c32b7ea6fb80b1aeff5a2ceb3fb9778e2759e899d9beff75584714afcc5ee"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_arm64.whl", hash = "sha256:5f3f6d315dcc176ba7ca6e74c7768fb7e4cf566c49cb143f6bc257b62e634ed8"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1d9d3713a237970569156cfb4de7533b7c4eacdd61789726f444f96a0d28f57f"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37a1fea41153dd6ee061d21ab69c9cf2cf543160b1b85d89cd3d2e2a7902ca4c"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b3c4ea4948d93c9c29dc01c0c23eef66f2101bf75158c291b88de6525c55c3d1"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22df30ffaa89f6643206cf13877191c63a50e8f800b038bc39bee9d2d4957632"}, - {file = "matplotlib-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b69676845a0a66f9da30e87f48be36734d6748024b525ec4710be40194282c84"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_amd64.whl", hash = "sha256:744991e0cc863dd669c8dc9136ca4e6e0082be2070b9d793cbd64bec872a6815"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_arm64.whl", 
hash = "sha256:fba2974df0bf8ce3c995fa84b79cde38326e0f7b5409e7a3a481c1141340bcf7"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:932c55d1fa7af4423422cb6a492a31cbcbdbe68fd1a9a3f545aa5e7a143b5355"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e38c2d581d62ee729a6e144c47a71b3f42fb4187508dbbf4fe71d5612c3433b"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:786656bb13c237bbcebcd402f65f44dd61ead60ee3deb045af429d889c8dbc67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09d7945a70ea43bf9248f4b6582734c2fe726723204a76eca233f24cffc7ef67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d0b181e9fa8daf1d9f2d4c547527b167cb8838fc587deabca7b5c01f97199e84"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:31963603041634ce1a96053047b40961f7a29eb8f9a62e80cc2c0427aa1d22a2"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:aebed7b50aa6ac698c90f60f854b47e48cd2252b30510e7a1feddaf5a3f72cbf"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d883460c43e8c6b173fef244a2341f7f7c0e9725c7fe68306e8e44ed9c8fb100"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07124afcf7a6504eafcb8ce94091c5898bbdd351519a1beb5c45f7a38c67e77f"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c17398b709a6cce3d9fdb1595c33e356d91c098cd9486cb2cc21ea2ea418e715"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7146d64f561498764561e9cd0ed64fcf582e570fc519e6f521e2d0cfd43365e1"}, - {file = "matplotlib-3.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:90ad854c0a435da3104c01e2c6f0028d7e719b690998a2333d7218db80950722"}, - {file = 
"matplotlib-3.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:4645fc5d9d20ffa3a39361fcdbcec731382763b623b72627806bf251b6388866"}, - {file = "matplotlib-3.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:9257be2f2a03415f9105c486d304a321168e61ad450f6153d77c69504ad764bb"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1e4bbad66c177a8fdfa53972e5ef8be72a5f27e6a607cec0d8579abd0f3102b1"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d8eb7194b084b12feb19142262165832fc6ee879b945491d1c3d4660748020c4"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4d41379b05528091f00e1728004f9a8d7191260f3862178b88e8fd770206318"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a74f79fafb2e177f240579bc83f0b60f82cc47d2f1d260f422a0627207008ca"}, - {file = "matplotlib-3.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:702590829c30aada1e8cef0568ddbffa77ca747b4d6e36c6d173f66e301f89cc"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:f79d5de970fc90cd5591f60053aecfce1fcd736e0303d9f0bf86be649fa68fb8"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:cb783436e47fcf82064baca52ce748af71725d0352e1d31564cbe9c95df92b9c"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5c09cf8f2793f81368f49f118b6f9f937456362bee282eac575cca7f84cda537"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:de66744b2bb88d5cd27e80dfc2ec9f0517d0a46d204ff98fe9e5f2864eb67657"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53cc80662dd197ece414dd5b66e07370201515a3eaf52e7c518c68c16814773b"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:15112bcbaef211bd663fa935ec33313b948e214454d949b723998a43357b17b0"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d2a959c640cdeecdd2ec3136e8ea0441da59bcaf58d67e9c590740addba2cb68"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3886e47f64611046bc1db523a09dd0a0a6bed6081e6f90e13806dd1d1d1b5e91"}, - {file = "matplotlib-3.10.7.tar.gz", hash = "sha256:a06ba7e2a2ef9131c79c49e63dad355d2d878413a0376c1727c8b9335ff731c7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = 
"matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = 
"matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = 
"matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, ] [package.dependencies] @@ -1917,13 +1917,13 @@ files = [ [[package]] name = "openai" -version = "2.9.0" +version = "2.11.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" files = [ - {file = "openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad"}, - {file = "openai-2.9.0.tar.gz", hash = "sha256:b52ec65727fc8f1eed2fbc86c8eac0998900c7ef63aa2eb5c24b69717c56fa5f"}, + {file = "openai-2.11.0-py3-none-any.whl", hash = "sha256:21189da44d2e3d027b08c7a920ba4454b8b7d6d30ae7e64d9de11dbe946d4faa"}, + {file = "openai-2.11.0.tar.gz", hash = "sha256:b3da01d92eda31524930b6ec9d7167c535e843918d7ba8a76b1c38f1104f321e"}, ] [package.dependencies] @@ 
-2927,30 +2927,30 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.14.8" +version = "0.14.9" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.14.8-py3-none-linux_armv6l.whl", hash = "sha256:ec071e9c82eca417f6111fd39f7043acb53cd3fde9b1f95bbed745962e345afb"}, - {file = "ruff-0.14.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:8cdb162a7159f4ca36ce980a18c43d8f036966e7f73f866ac8f493b75e0c27e9"}, - {file = "ruff-0.14.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2e2fcbefe91f9fad0916850edf0854530c15bd1926b6b779de47e9ab619ea38f"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9d70721066a296f45786ec31916dc287b44040f553da21564de0ab4d45a869b"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2c87e09b3cd9d126fc67a9ecd3b5b1d3ded2b9c7fce3f16e315346b9d05cfb52"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d62cb310c4fbcb9ee4ac023fe17f984ae1e12b8a4a02e3d21489f9a2a5f730c"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1af35c2d62633d4da0521178e8a2641c636d2a7153da0bac1b30cfd4ccd91344"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25add4575ffecc53d60eed3f24b1e934493631b48ebbc6ebaf9d8517924aca4b"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4c943d847b7f02f7db4201a0600ea7d244d8a404fbb639b439e987edcf2baf9a"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb6e8bf7b4f627548daa1b69283dac5a296bfe9ce856703b03130732e20ddfe2"}, - {file = "ruff-0.14.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:7aaf2974f378e6b01d1e257c6948207aec6a9b5ba53fab23d0182efb887a0e4a"}, - {file = 
"ruff-0.14.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e5758ca513c43ad8a4ef13f0f081f80f08008f410790f3611a21a92421ab045b"}, - {file = "ruff-0.14.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f74f7ba163b6e85a8d81a590363bf71618847e5078d90827749bfda1d88c9cdf"}, - {file = "ruff-0.14.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:eed28f6fafcc9591994c42254f5a5c5ca40e69a30721d2ab18bb0bb3baac3ab6"}, - {file = "ruff-0.14.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:21d48fa744c9d1cb8d71eb0a740c4dd02751a5de9db9a730a8ef75ca34cf138e"}, - {file = "ruff-0.14.8-py3-none-win32.whl", hash = "sha256:15f04cb45c051159baebb0f0037f404f1dc2f15a927418f29730f411a79bc4e7"}, - {file = "ruff-0.14.8-py3-none-win_amd64.whl", hash = "sha256:9eeb0b24242b5bbff3011409a739929f497f3fb5fe3b5698aba5e77e8c833097"}, - {file = "ruff-0.14.8-py3-none-win_arm64.whl", hash = "sha256:965a582c93c63fe715fd3e3f8aa37c4b776777203d8e1d8aa3cc0c14424a4b99"}, - {file = "ruff-0.14.8.tar.gz", hash = "sha256:774ed0dd87d6ce925e3b8496feb3a00ac564bea52b9feb551ecd17e0a23d1eed"}, + {file = "ruff-0.14.9-py3-none-linux_armv6l.whl", hash = "sha256:f1ec5de1ce150ca6e43691f4a9ef5c04574ad9ca35c8b3b0e18877314aba7e75"}, + {file = "ruff-0.14.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ed9d7417a299fc6030b4f26333bf1117ed82a61ea91238558c0268c14e00d0c2"}, + {file = "ruff-0.14.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d5dc3473c3f0e4a1008d0ef1d75cee24a48e254c8bed3a7afdd2b4392657ed2c"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84bf7c698fc8f3cb8278830fb6b5a47f9bcc1ed8cb4f689b9dd02698fa840697"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa733093d1f9d88a5d98988d8834ef5d6f9828d03743bf5e338bf980a19fce27"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a1cfb04eda979b20c8c19550c8b5f498df64ff8da151283311ce3199e8b3648"}, + {file = 
"ruff-0.14.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1e5cb521e5ccf0008bd74d5595a4580313844a42b9103b7388eca5a12c970743"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd429a8926be6bba4befa8cdcf3f4dd2591c413ea5066b1e99155ed245ae42bb"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab208c1b7a492e37caeaf290b1378148f75e13c2225af5d44628b95fd7834273"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72034534e5b11e8a593f517b2f2f2b273eb68a30978c6a2d40473ad0aaa4cb4a"}, + {file = "ruff-0.14.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:712ff04f44663f1b90a1195f51525836e3413c8a773574a7b7775554269c30ed"}, + {file = "ruff-0.14.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a111fee1db6f1d5d5810245295527cda1d367c5aa8f42e0fca9a78ede9b4498b"}, + {file = "ruff-0.14.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8769efc71558fecc25eb295ddec7d1030d41a51e9dcf127cbd63ec517f22d567"}, + {file = "ruff-0.14.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:347e3bf16197e8a2de17940cd75fd6491e25c0aa7edf7d61aa03f146a1aa885a"}, + {file = "ruff-0.14.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7715d14e5bccf5b660f54516558aa94781d3eb0838f8e706fb60e3ff6eff03a8"}, + {file = "ruff-0.14.9-py3-none-win32.whl", hash = "sha256:df0937f30aaabe83da172adaf8937003ff28172f59ca9f17883b4213783df197"}, + {file = "ruff-0.14.9-py3-none-win_amd64.whl", hash = "sha256:c0b53a10e61df15a42ed711ec0bda0c582039cf6c754c49c020084c55b5b0bc2"}, + {file = "ruff-0.14.9-py3-none-win_arm64.whl", hash = "sha256:8e821c366517a074046d92f0e9213ed1c13dbc5b37a7fc20b07f79b64d62cc84"}, + {file = "ruff-0.14.9.tar.gz", hash = "sha256:35f85b25dd586381c0cc053f48826109384c81c00ad7ef1bd977bfcc28119d5b"}, ] [[package]] @@ -3442,13 +3442,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP 
library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] @@ -3853,4 +3853,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "4742452eb060af79be205f6082f359475e15870d9a417a8cc63f994f4883fa64" +content-hash = "f9d556fd9b6533e03dd294f1565d5ce832f90be74dc3bd849abe79c511568d76" diff --git a/security_scanning/examples/models/core/qwen/pyproject.toml b/security_scanning/examples/models/core/qwen/pyproject.toml index 806dee9f56..d72d622ef9 100644 --- a/security_scanning/examples/models/core/qwen/pyproject.toml +++ b/security_scanning/examples/models/core/qwen/pyproject.toml @@ -19,7 +19,7 @@ gradio = "4.44.1" mdtex2html = "^1.3.2" sse-starlette = "^3.0.3" aiohttp-sse-client = "^0.2.1" -openai = "^2.9.0" +openai = "^2.11.0" [build-system] diff --git a/security_scanning/examples/models/core/qwen2audio/poetry.lock b/security_scanning/examples/models/core/qwen2audio/poetry.lock index 91bc119d78..48e847b41e 100644 --- a/security_scanning/examples/models/core/qwen2audio/poetry.lock +++ b/security_scanning/examples/models/core/qwen2audio/poetry.lock @@ -1962,13 +1962,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwenvl/poetry.lock b/security_scanning/examples/models/core/qwenvl/poetry.lock index f4a26d49cc..9c7ccf40af 100644 --- a/security_scanning/examples/models/core/qwenvl/poetry.lock +++ b/security_scanning/examples/models/core/qwenvl/poetry.lock @@ -945,13 +945,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1230,66 +1230,66 @@ files = [ [[package]] name = "matplotlib" -version = "3.10.7" +version = "3.10.8" description = "Python plotting package" optional = false python-versions = ">=3.10" files = [ - {file = "matplotlib-3.10.7-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:7ac81eee3b7c266dd92cee1cd658407b16c57eed08c7421fa354ed68234de380"}, - {file = "matplotlib-3.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:667ecd5d8d37813a845053d8f5bf110b534c3c9f30e69ebd25d4701385935a6d"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc1c51b846aca49a5a8b44fbba6a92d583a35c64590ad9e1e950dc88940a4297"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a11c2e9e72e7de09b7b72e62f3df23317c888299c875e2b778abf1eda8c0a42"}, - {file = "matplotlib-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f19410b486fdd139885ace124e57f938c1e6a3210ea13dd29cab58f5d4bc12c7"}, - {file = "matplotlib-3.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:b498e9e4022f93de2d5a37615200ca01297ceebbb56fe4c833f46862a490f9e3"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:53b492410a6cd66c7a471de6c924f6ede976e963c0f3097a3b7abfadddc67d0a"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9749313deb729f08207718d29c86246beb2ea3fdba753595b55901dee5d2fd6"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2222c7ba2cbde7fe63032769f6eb7e83ab3227f47d997a8453377709b7fe3a5a"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e91f61a064c92c307c5a9dc8c05dc9f8a68f0a3be199d9a002a0622e13f874a1"}, - {file = "matplotlib-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f1851eab59ca082c95df5a500106bad73672645625e04538b3ad0f69471ffcc"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:6516ce375109c60ceec579e699524e9d504cd7578506f01150f7a6bc174a775e"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:b172db79759f5f9bc13ef1c3ef8b9ee7b37b0247f987fbbbdaa15e4f87fd46a9"}, - {file = 
"matplotlib-3.10.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7a0edb7209e21840e8361e91ea84ea676658aa93edd5f8762793dec77a4a6748"}, - {file = "matplotlib-3.10.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c380371d3c23e0eadf8ebff114445b9f970aff2010198d498d4ab4c3b41eea4f"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5f256d49fea31f40f166a5e3131235a5d2f4b7f44520b1cf0baf1ce568ccff0"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11ae579ac83cdf3fb72573bb89f70e0534de05266728740d478f0f818983c695"}, - {file = "matplotlib-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4c14b6acd16cddc3569a2d515cfdd81c7a68ac5639b76548cfc1a9e48b20eb65"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_amd64.whl", hash = "sha256:0d8c32b7ea6fb80b1aeff5a2ceb3fb9778e2759e899d9beff75584714afcc5ee"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_arm64.whl", hash = "sha256:5f3f6d315dcc176ba7ca6e74c7768fb7e4cf566c49cb143f6bc257b62e634ed8"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1d9d3713a237970569156cfb4de7533b7c4eacdd61789726f444f96a0d28f57f"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37a1fea41153dd6ee061d21ab69c9cf2cf543160b1b85d89cd3d2e2a7902ca4c"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b3c4ea4948d93c9c29dc01c0c23eef66f2101bf75158c291b88de6525c55c3d1"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22df30ffaa89f6643206cf13877191c63a50e8f800b038bc39bee9d2d4957632"}, - {file = "matplotlib-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b69676845a0a66f9da30e87f48be36734d6748024b525ec4710be40194282c84"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_amd64.whl", hash = 
"sha256:744991e0cc863dd669c8dc9136ca4e6e0082be2070b9d793cbd64bec872a6815"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_arm64.whl", hash = "sha256:fba2974df0bf8ce3c995fa84b79cde38326e0f7b5409e7a3a481c1141340bcf7"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:932c55d1fa7af4423422cb6a492a31cbcbdbe68fd1a9a3f545aa5e7a143b5355"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e38c2d581d62ee729a6e144c47a71b3f42fb4187508dbbf4fe71d5612c3433b"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:786656bb13c237bbcebcd402f65f44dd61ead60ee3deb045af429d889c8dbc67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09d7945a70ea43bf9248f4b6582734c2fe726723204a76eca233f24cffc7ef67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d0b181e9fa8daf1d9f2d4c547527b167cb8838fc587deabca7b5c01f97199e84"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:31963603041634ce1a96053047b40961f7a29eb8f9a62e80cc2c0427aa1d22a2"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:aebed7b50aa6ac698c90f60f854b47e48cd2252b30510e7a1feddaf5a3f72cbf"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d883460c43e8c6b173fef244a2341f7f7c0e9725c7fe68306e8e44ed9c8fb100"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07124afcf7a6504eafcb8ce94091c5898bbdd351519a1beb5c45f7a38c67e77f"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c17398b709a6cce3d9fdb1595c33e356d91c098cd9486cb2cc21ea2ea418e715"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7146d64f561498764561e9cd0ed64fcf582e570fc519e6f521e2d0cfd43365e1"}, - {file = 
"matplotlib-3.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:90ad854c0a435da3104c01e2c6f0028d7e719b690998a2333d7218db80950722"}, - {file = "matplotlib-3.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:4645fc5d9d20ffa3a39361fcdbcec731382763b623b72627806bf251b6388866"}, - {file = "matplotlib-3.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:9257be2f2a03415f9105c486d304a321168e61ad450f6153d77c69504ad764bb"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1e4bbad66c177a8fdfa53972e5ef8be72a5f27e6a607cec0d8579abd0f3102b1"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d8eb7194b084b12feb19142262165832fc6ee879b945491d1c3d4660748020c4"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4d41379b05528091f00e1728004f9a8d7191260f3862178b88e8fd770206318"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a74f79fafb2e177f240579bc83f0b60f82cc47d2f1d260f422a0627207008ca"}, - {file = "matplotlib-3.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:702590829c30aada1e8cef0568ddbffa77ca747b4d6e36c6d173f66e301f89cc"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:f79d5de970fc90cd5591f60053aecfce1fcd736e0303d9f0bf86be649fa68fb8"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:cb783436e47fcf82064baca52ce748af71725d0352e1d31564cbe9c95df92b9c"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5c09cf8f2793f81368f49f118b6f9f937456362bee282eac575cca7f84cda537"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:de66744b2bb88d5cd27e80dfc2ec9f0517d0a46d204ff98fe9e5f2864eb67657"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53cc80662dd197ece414dd5b66e07370201515a3eaf52e7c518c68c16814773b"}, 
- {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:15112bcbaef211bd663fa935ec33313b948e214454d949b723998a43357b17b0"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d2a959c640cdeecdd2ec3136e8ea0441da59bcaf58d67e9c590740addba2cb68"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3886e47f64611046bc1db523a09dd0a0a6bed6081e6f90e13806dd1d1d1b5e91"}, - {file = "matplotlib-3.10.7.tar.gz", hash = "sha256:a06ba7e2a2ef9131c79c49e63dad355d2d878413a0376c1727c8b9335ff731c7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = 
"matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = 
"sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, ] [package.dependencies] @@ -2918,18 +2918,18 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "5.0.0rc0" +version = "5.0.0rc1" description = "Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training." 
optional = false python-versions = ">=3.10.0" files = [ - {file = "transformers-5.0.0rc0-py3-none-any.whl", hash = "sha256:1935f8b396891c93b8520d951d4385da1b1b778914e1d79ed151ddbd32d83a22"}, - {file = "transformers-5.0.0rc0.tar.gz", hash = "sha256:bb427000caa4a88943704f80448b2323ad8c6a2f4f13c1433e27d0a1f690c975"}, + {file = "transformers-5.0.0rc1-py3-none-any.whl", hash = "sha256:8b9604700769872cab4280dbcde201f557e93f72ee5a85c4592275ab4f15d330"}, + {file = "transformers-5.0.0rc1.tar.gz", hash = "sha256:1fdde557b96ef8ea277c45b8e0d558f1e167fe28a98593f4c4aec0277e335821"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=1.0.0,<2.0" +huggingface-hub = ">=1.2.1,<2.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" @@ -2942,20 +2942,20 @@ typer-slim = "*" [package.extras] accelerate = ["accelerate (>=1.1.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)", "jmespath (>=1.0.1)"] codecarbon = ["codecarbon 
(>=2.8.1)"] deepspeed = ["accelerate (>=1.1.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers 
(>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", 
"rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", 
"accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] ftfy = ["ftfy"] -hf-xet = ["hf-xet"] +hf-xet = ["hf_xet"] hub-kernels = ["kernels (>=0.10.2,<0.11)"] integrations = ["kernels (>=0.10.2,<0.11)", "optuna", "ray[tune] (>=2.7.0)"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = 
["natten (>=0.14.6,<0.15.0)"] @@ -2971,14 +2971,14 @@ sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] serving = ["accelerate (>=1.1.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "rich", "starlette", "torch (>=2.2)", "uvicorn"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] tiktoken = ["blobfile", "tiktoken"] timm = ["timm (!=1.0.18,<=1.0.19)"] tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"] torch = ["accelerate 
(>=1.1.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=1.0.0,<2.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=1.2.1,<2.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -3065,13 +3065,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] @@ -3376,4 +3376,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "1625851fb799990cd6afbe92be0f7cdfa87a2bc3f161ad51ecdc6aa2ff250630" +content-hash = "8e132d98c6eeeb321a2a7ce259a8cfb2edcea6e965c33cd9927d02847540f452" diff --git a/security_scanning/examples/models/core/qwenvl/pyproject.toml b/security_scanning/examples/models/core/qwenvl/pyproject.toml index 8765744ba4..f975fd14fb 100644 --- 
a/security_scanning/examples/models/core/qwenvl/pyproject.toml +++ b/security_scanning/examples/models/core/qwenvl/pyproject.toml @@ -14,7 +14,7 @@ transformers-stream-generator = "^0.0.5" sentencepiece = ">=0.1.99" tiktoken = "^0.12.0" einops = "^0.8.1" -matplotlib = "^3.10.7" +matplotlib = "^3.10.8" torchvision = "^0.24.1" diff --git a/security_scanning/examples/models/core/recurrentgemma/poetry.lock b/security_scanning/examples/models/core/recurrentgemma/poetry.lock index 286080432a..8187e11a1f 100644 --- a/security_scanning/examples/models/core/recurrentgemma/poetry.lock +++ b/security_scanning/examples/models/core/recurrentgemma/poetry.lock @@ -2506,13 +2506,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index abac1b6c4b..a39420fe59 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -1343,32 +1343,32 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "numba" -version = "0.63.0" +version = "0.63.1" description = "compiling Python code using LLVM" optional = false python-versions = ">=3.10" files = [ - {file = 
"numba-0.63.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:09534fd6e7a08a2b26c36449e62120563ed548c91c5ec5e00b10ecac8fb86460"}, - {file = "numba-0.63.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ff6ad346bf5010a02fedec7e7161947109cc45bedfacdad609d1d7d7aec34426"}, - {file = "numba-0.63.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91908f74c64ec0a2006a53b01bffd088d3289d403b063584197974fabc431aff"}, - {file = "numba-0.63.0-cp310-cp310-win_amd64.whl", hash = "sha256:d900bee63b2546352f3bbb533beb74f9825c8f58afb80632625a2d9606ea56af"}, - {file = "numba-0.63.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94ae468e1a1ff9b6f8b8e6920caa46f353bed7c088077912a310f737966147cb"}, - {file = "numba-0.63.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a801fff99a7ccdd79405b061f9b624234b84263c5e2a5a38408e8fb19fc3a243"}, - {file = "numba-0.63.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0627914cc99e4b1ed386b74f81c819c9bc67fb4a0447695309881529fa534d2"}, - {file = "numba-0.63.0-cp311-cp311-win_amd64.whl", hash = "sha256:b24bfffc2e877581ff13cc3e041f69224939c616dd2b1ef20cd856e743ad66fd"}, - {file = "numba-0.63.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7e15588b2e3f4ea8c74d294d0a9f3b8262e7a34c4c5b4f5850d5779334a13d20"}, - {file = "numba-0.63.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2279cbcd459440eca36be1078ad14e96a7a339124065a83577feb8849d28453d"}, - {file = "numba-0.63.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18add2794439ae7289f1836ef73901b2a9e2c1fd91f7389af0c447626955519d"}, - {file = "numba-0.63.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5343da27ce538bf60b05397d378ae71e16c7fea99a5973a7f42eb51d3471e20"}, - {file = "numba-0.63.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:26859efd087b9eafc3da450a69a0211903622f723fce2da9aef84f04c01f804b"}, - {file = 
"numba-0.63.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:491fd265dd3ca837d31486fd688ec5331eb79213160c88fcb18555d128dc99ac"}, - {file = "numba-0.63.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2dd16fceaffff2f6f36abe9f03291e1d732b7d0978524a315f1ea9ee380d1859"}, - {file = "numba-0.63.0-cp313-cp313-win_amd64.whl", hash = "sha256:78ff9d00ab3374f87683bf902195c990049cdea7dd2d82d09b5a9fcdb68f6ae0"}, - {file = "numba-0.63.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:7593766bd0210d8dece4915de8a9e5393e93fdc6ad1ca8576555645a926cfe3a"}, - {file = "numba-0.63.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1c1d3bd8757d2ee674ba8b20b34c1f5aa5d40ba4e5d6424d608e8e880deeb7b2"}, - {file = "numba-0.63.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0f30d46a9e4b7699cdeeb020e1e323352266f147f1b8a3feb0d4d37cde2fddd"}, - {file = "numba-0.63.0-cp314-cp314-win_amd64.whl", hash = "sha256:34f59efe05e5237ed8cd4d592303467eac5b8fdc6fac716542140e6bcb5f9d7c"}, - {file = "numba-0.63.0.tar.gz", hash = "sha256:27e525ce6f9f727c4f61e89b9d453d4a7d0aabbbf110278988334f43cbd70fdc"}, + {file = "numba-0.63.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6d6bf5bf00f7db629305caaec82a2ffb8abe2bf45eaad0d0738dc7de4113779"}, + {file = "numba-0.63.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08653d0dfc9cc9c4c9a8fba29ceb1f2d5340c3b86c4a7e5e07e42b643bc6a2f4"}, + {file = "numba-0.63.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f09eebf5650246ce2a4e9a8d38270e2d4b0b0ae978103bafb38ed7adc5ea906e"}, + {file = "numba-0.63.1-cp310-cp310-win_amd64.whl", hash = "sha256:f8bba17421d865d8c0f7be2142754ebce53e009daba41c44cf6909207d1a8d7d"}, + {file = "numba-0.63.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b33db00f18ccc790ee9911ce03fcdfe9d5124637d1ecc266f5ae0df06e02fec3"}, + {file = 
"numba-0.63.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7d31ea186a78a7c0f6b1b2a3fe68057fdb291b045c52d86232b5383b6cf4fc25"}, + {file = "numba-0.63.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed3bb2fbdb651d6aac394388130a7001aab6f4541837123a4b4ab8b02716530c"}, + {file = "numba-0.63.1-cp311-cp311-win_amd64.whl", hash = "sha256:1ecbff7688f044b1601be70113e2fb1835367ee0b28ffa8f3adf3a05418c5c87"}, + {file = "numba-0.63.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2819cd52afa5d8d04e057bdfd54367575105f8829350d8fb5e4066fb7591cc71"}, + {file = "numba-0.63.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5cfd45dbd3d409e713b1ccfdc2ee72ca82006860254429f4ef01867fdba5845f"}, + {file = "numba-0.63.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69a599df6976c03b7ecf15d05302696f79f7e6d10d620367407517943355bcb0"}, + {file = "numba-0.63.1-cp312-cp312-win_amd64.whl", hash = "sha256:bbad8c63e4fc7eb3cdb2c2da52178e180419f7969f9a685f283b313a70b92af3"}, + {file = "numba-0.63.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:0bd4fd820ef7442dcc07da184c3f54bb41d2bdb7b35bacf3448e73d081f730dc"}, + {file = "numba-0.63.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53de693abe4be3bd4dee38e1c55f01c55ff644a6a3696a3670589e6e4c39cde2"}, + {file = "numba-0.63.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81227821a72a763c3d4ac290abbb4371d855b59fdf85d5af22a47c0e86bf8c7e"}, + {file = "numba-0.63.1-cp313-cp313-win_amd64.whl", hash = "sha256:eb227b07c2ac37b09432a9bda5142047a2d1055646e089d4a240a2643e508102"}, + {file = "numba-0.63.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:f180883e5508940cc83de8a8bea37fc6dd20fbe4e5558d4659b8b9bef5ff4731"}, + {file = "numba-0.63.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:f0938764afa82a47c0e895637a6c55547a42c9e1d35cac42285b1fa60a8b02bb"}, + {file = "numba-0.63.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f90a929fa5094e062d4e0368ede1f4497d5e40f800e80aa5222c4734236a2894"}, + {file = "numba-0.63.1-cp314-cp314-win_amd64.whl", hash = "sha256:8d6d5ce85f572ed4e1a135dbb8c0114538f9dd0e3657eeb0bb64ab204cbe2a8f"}, + {file = "numba-0.63.1.tar.gz", hash = "sha256:b320aa675d0e3b17b40364935ea52a7b1c670c9037c39cf92c49502a75902f4b"}, ] [package.dependencies] @@ -2857,13 +2857,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/ngram/poetry.lock b/security_scanning/examples/ngram/poetry.lock index e68bc97032..88fb476241 100644 --- a/security_scanning/examples/ngram/poetry.lock +++ b/security_scanning/examples/ngram/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = 
"sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1821,13 +1821,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/quantization/poetry.lock b/security_scanning/examples/quantization/poetry.lock index 7490b5d493..aa13aec255 100644 --- a/security_scanning/examples/quantization/poetry.lock +++ b/security_scanning/examples/quantization/poetry.lock @@ -736,13 +736,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = 
"sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1846,18 +1846,18 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "5.0.0rc0" +version = "5.0.0rc1" description = "Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training." optional = false python-versions = ">=3.10.0" files = [ - {file = "transformers-5.0.0rc0-py3-none-any.whl", hash = "sha256:1935f8b396891c93b8520d951d4385da1b1b778914e1d79ed151ddbd32d83a22"}, - {file = "transformers-5.0.0rc0.tar.gz", hash = "sha256:bb427000caa4a88943704f80448b2323ad8c6a2f4f13c1433e27d0a1f690c975"}, + {file = "transformers-5.0.0rc1-py3-none-any.whl", hash = "sha256:8b9604700769872cab4280dbcde201f557e93f72ee5a85c4592275ab4f15d330"}, + {file = "transformers-5.0.0rc1.tar.gz", hash = "sha256:1fdde557b96ef8ea277c45b8e0d558f1e167fe28a98593f4c4aec0277e335821"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=1.0.0,<2.0" +huggingface-hub = ">=1.2.1,<2.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" @@ -1870,20 +1870,20 @@ typer-slim = "*" [package.extras] accelerate = ["accelerate (>=1.1.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "codecarbon (>=2.8.1)", "jinja2 (>=3.1.0)", 
"jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)", "jmespath (>=1.0.1)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=1.1.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic 
(>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", 
"accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jinja2 (>=3.1.0)", "jmespath (>=1.0.1)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", 
"sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] +dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=1.1.0)", "accelerate (>=1.1.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.10.2,<0.11)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "openai (>=1.98.0)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)", "uvicorn"] ftfy = ["ftfy"] -hf-xet = ["hf-xet"] +hf-xet = ["hf_xet"] hub-kernels = ["kernels 
(>=0.10.2,<0.11)"] integrations = ["kernels (>=0.10.2,<0.11)", "optuna", "ray[tune] (>=2.7.0)"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] @@ -1899,14 +1899,14 @@ sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] serving = ["accelerate (>=1.1.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "rich", "starlette", "torch (>=2.2)", "uvicorn"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] +testing = ["GitPython (<3.1.19)", "accelerate (>=1.1.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.4.6)", "faiss-cpu", "fastapi", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "openai (>=1.98.0)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", 
"pydantic (>=2)", "pytest (>=7.2.0,<9.0.0)", "pytest-asyncio (>=1.2.0)", "pytest-order", "pytest-rerunfailures (<16.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.13.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "starlette", "tensorboard", "timeout-decorator", "torch (>=2.2)", "uvicorn"] tiktoken = ["blobfile", "tiktoken"] timm = ["timm (!=1.0.18,<=1.0.19)"] tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"] torch = ["accelerate (>=1.1.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=1.0.0,<2.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=1.2.1,<2.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -1965,13 +1965,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/ray_orchestrator/poetry.lock b/security_scanning/examples/ray_orchestrator/poetry.lock index 9a3fd66660..7bfa4210c9 100644 --- a/security_scanning/examples/ray_orchestrator/poetry.lock +++ b/security_scanning/examples/ray_orchestrator/poetry.lock @@ -1032,13 +1032,13 @@ files = [ [[package]] name = "opentelemetry-api" -version = "1.39.0" +version = "1.39.1" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.9" files = [ - {file = "opentelemetry_api-1.39.0-py3-none-any.whl", hash = "sha256:3c3b3ca5c5687b1b5b37e5c5027ff68eacea8675241b29f13110a8ffbb8f0459"}, - {file = "opentelemetry_api-1.39.0.tar.gz", hash = "sha256:6130644268c5ac6bdffaf660ce878f10906b3e789f7e2daa5e169b047a2933b9"}, + {file = "opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950"}, + {file = "opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c"}, ] [package.dependencies] @@ -1047,29 +1047,29 @@ typing-extensions = ">=4.5.0" [[package]] name = "opentelemetry-exporter-prometheus" -version = "0.60b0" +version = "0.60b1" description = "Prometheus Metric Exporter for OpenTelemetry" optional = false python-versions = ">=3.9" files = [ - {file = "opentelemetry_exporter_prometheus-0.60b0-py3-none-any.whl", hash = 
"sha256:4f616397040257fae4c5e5272b57b47c13372e3b7f0f2db2427fd4dbe69c60b5"}, - {file = "opentelemetry_exporter_prometheus-0.60b0.tar.gz", hash = "sha256:c6ae33e52cdd1dbfed1f7436935df94eb03c725b57322026d04e6fbc37108e6e"}, + {file = "opentelemetry_exporter_prometheus-0.60b1-py3-none-any.whl", hash = "sha256:49f59178de4f4590e3cef0b8b95cf6e071aae70e1f060566df5546fad773b8fd"}, + {file = "opentelemetry_exporter_prometheus-0.60b1.tar.gz", hash = "sha256:a4011b46906323f71724649d301b4dc188aaa068852e814f4df38cc76eac616b"}, ] [package.dependencies] opentelemetry-api = ">=1.12,<2.0" -opentelemetry-sdk = ">=1.39.0,<1.40.0" +opentelemetry-sdk = ">=1.39.1,<1.40.0" prometheus-client = ">=0.5.0,<1.0.0" [[package]] name = "opentelemetry-proto" -version = "1.39.0" +version = "1.39.1" description = "OpenTelemetry Python Proto" optional = false python-versions = ">=3.9" files = [ - {file = "opentelemetry_proto-1.39.0-py3-none-any.whl", hash = "sha256:1e086552ac79acb501485ff0ce75533f70f3382d43d0a30728eeee594f7bf818"}, - {file = "opentelemetry_proto-1.39.0.tar.gz", hash = "sha256:c1fa48678ad1a1624258698e59be73f990b7fc1f39e73e16a9d08eef65dd838c"}, + {file = "opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007"}, + {file = "opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8"}, ] [package.dependencies] @@ -1077,33 +1077,33 @@ protobuf = ">=5.0,<7.0" [[package]] name = "opentelemetry-sdk" -version = "1.39.0" +version = "1.39.1" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.9" files = [ - {file = "opentelemetry_sdk-1.39.0-py3-none-any.whl", hash = "sha256:90cfb07600dfc0d2de26120cebc0c8f27e69bf77cd80ef96645232372709a514"}, - {file = "opentelemetry_sdk-1.39.0.tar.gz", hash = "sha256:c22204f12a0529e07aa4d985f1bca9d6b0e7b29fe7f03e923548ae52e0e15dde"}, + {file = "opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = 
"sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c"}, + {file = "opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6"}, ] [package.dependencies] -opentelemetry-api = "1.39.0" -opentelemetry-semantic-conventions = "0.60b0" +opentelemetry-api = "1.39.1" +opentelemetry-semantic-conventions = "0.60b1" typing-extensions = ">=4.5.0" [[package]] name = "opentelemetry-semantic-conventions" -version = "0.60b0" +version = "0.60b1" description = "OpenTelemetry Semantic Conventions" optional = false python-versions = ">=3.9" files = [ - {file = "opentelemetry_semantic_conventions-0.60b0-py3-none-any.whl", hash = "sha256:069530852691136018087b52688857d97bba61cd641d0f8628d2d92788c4f78a"}, - {file = "opentelemetry_semantic_conventions-0.60b0.tar.gz", hash = "sha256:227d7aa73cbb8a2e418029d6b6465553aa01cf7e78ec9d0bc3255c7b3ac5bf8f"}, + {file = "opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb"}, + {file = "opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953"}, ] [package.dependencies] -opentelemetry-api = "1.39.0" +opentelemetry-api = "1.39.1" typing-extensions = ">=4.5.0" [[package]] @@ -1902,13 +1902,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/redrafter/poetry.lock b/security_scanning/examples/redrafter/poetry.lock index 03e1ded960..c23e474e3b 100644 --- a/security_scanning/examples/redrafter/poetry.lock +++ b/security_scanning/examples/redrafter/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.1" +version = "1.2.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.1-py3-none-any.whl", hash = "sha256:8c74a41a16156337dfa1090873ca11f8c1d7b6efcbac9f6673d008a740207e6a"}, - {file = "huggingface_hub-1.2.1.tar.gz", hash = "sha256:1aced061fa1bd443c0ec80a4af432b8b70041d54860f7af334ceff599611a415"}, + {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, + {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, ] [package.dependencies] @@ -1831,13 +1831,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/examples/trtllm-eval/poetry.lock b/security_scanning/examples/trtllm-eval/poetry.lock index 150bbc6c69..970031c4f9 100644 --- a/security_scanning/examples/trtllm-eval/poetry.lock +++ b/security_scanning/examples/trtllm-eval/poetry.lock @@ -3264,13 +3264,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 6f015659d4..23ee601d5b 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "36c9e7cfe670db782d69f37bcc772baaa5c86ff1", - "timestamp": "2025-12-10T02:39:25Z" + "commit_hash": "e8efeb765d7b2a23e123e80ed10dc7f98348e790", + "timestamp": "2025-12-12T02:39:32Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 36dbdfcc91..edc77a01f3 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -3008,13 +3008,13 @@ onnx = ">=1.14.0" [[package]] name = "openai" -version = "2.9.0" +version = "2.11.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" files = [ - {file = "openai-2.9.0-py3-none-any.whl", hash = "sha256:0d168a490fbb45630ad508a6f3022013c155a68fd708069b6a1a01a5e8f0ffad"}, - {file = "openai-2.9.0.tar.gz", hash = "sha256:b52ec65727fc8f1eed2fbc86c8eac0998900c7ef63aa2eb5c24b69717c56fa5f"}, + {file = "openai-2.11.0-py3-none-any.whl", hash = "sha256:21189da44d2e3d027b08c7a920ba4454b8b7d6d30ae7e64d9de11dbe946d4faa"}, + {file = "openai-2.11.0.tar.gz", hash = "sha256:b3da01d92eda31524930b6ec9d7167c535e843918d7ba8a76b1c38f1104f321e"}, ] [package.dependencies] @@ -5364,13 +5364,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" 
description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] @@ -5773,4 +5773,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "3ec3d9eabf7664da1722a32823997c383d7eab2dc54fa2c10e67849245300beb" +content-hash = "b0bf1650e7b7f69715b76d39327eb3223e6b6f5833582562d252f4c07d3f3ec9" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 562500afc0..274d314db8 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -18,7 +18,7 @@ mpi4py = "^4.1.1" numpy = "<2" onnx = ">=1.18.0,<1.20.0" onnx-graphsurgeon = ">=0.5.2" -openai = "^2.9.0" +openai = "^2.11.0" polygraphy = "^0.49.26" psutil = "^7.1.3" nvidia-ml-py = ">=13" diff --git a/security_scanning/tests/integration/defs/perf/poetry.lock b/security_scanning/tests/integration/defs/perf/poetry.lock index 1c5e58588a..f5bc59f321 100644 --- a/security_scanning/tests/integration/defs/perf/poetry.lock +++ b/security_scanning/tests/integration/defs/perf/poetry.lock @@ -275,66 +275,66 @@ files = [ [[package]] name = "matplotlib" -version = "3.10.7" +version = "3.10.8" description = "Python plotting package" optional = false python-versions = ">=3.10" files = [ - {file = "matplotlib-3.10.7-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7ac81eee3b7c266dd92cee1cd658407b16c57eed08c7421fa354ed68234de380"}, - {file = 
"matplotlib-3.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:667ecd5d8d37813a845053d8f5bf110b534c3c9f30e69ebd25d4701385935a6d"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc1c51b846aca49a5a8b44fbba6a92d583a35c64590ad9e1e950dc88940a4297"}, - {file = "matplotlib-3.10.7-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a11c2e9e72e7de09b7b72e62f3df23317c888299c875e2b778abf1eda8c0a42"}, - {file = "matplotlib-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f19410b486fdd139885ace124e57f938c1e6a3210ea13dd29cab58f5d4bc12c7"}, - {file = "matplotlib-3.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:b498e9e4022f93de2d5a37615200ca01297ceebbb56fe4c833f46862a490f9e3"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:53b492410a6cd66c7a471de6c924f6ede976e963c0f3097a3b7abfadddc67d0a"}, - {file = "matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9749313deb729f08207718d29c86246beb2ea3fdba753595b55901dee5d2fd6"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2222c7ba2cbde7fe63032769f6eb7e83ab3227f47d997a8453377709b7fe3a5a"}, - {file = "matplotlib-3.10.7-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e91f61a064c92c307c5a9dc8c05dc9f8a68f0a3be199d9a002a0622e13f874a1"}, - {file = "matplotlib-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f1851eab59ca082c95df5a500106bad73672645625e04538b3ad0f69471ffcc"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:6516ce375109c60ceec579e699524e9d504cd7578506f01150f7a6bc174a775e"}, - {file = "matplotlib-3.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:b172db79759f5f9bc13ef1c3ef8b9ee7b37b0247f987fbbbdaa15e4f87fd46a9"}, - {file = "matplotlib-3.10.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:7a0edb7209e21840e8361e91ea84ea676658aa93edd5f8762793dec77a4a6748"}, - {file = "matplotlib-3.10.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c380371d3c23e0eadf8ebff114445b9f970aff2010198d498d4ab4c3b41eea4f"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5f256d49fea31f40f166a5e3131235a5d2f4b7f44520b1cf0baf1ce568ccff0"}, - {file = "matplotlib-3.10.7-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11ae579ac83cdf3fb72573bb89f70e0534de05266728740d478f0f818983c695"}, - {file = "matplotlib-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4c14b6acd16cddc3569a2d515cfdd81c7a68ac5639b76548cfc1a9e48b20eb65"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_amd64.whl", hash = "sha256:0d8c32b7ea6fb80b1aeff5a2ceb3fb9778e2759e899d9beff75584714afcc5ee"}, - {file = "matplotlib-3.10.7-cp312-cp312-win_arm64.whl", hash = "sha256:5f3f6d315dcc176ba7ca6e74c7768fb7e4cf566c49cb143f6bc257b62e634ed8"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1d9d3713a237970569156cfb4de7533b7c4eacdd61789726f444f96a0d28f57f"}, - {file = "matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37a1fea41153dd6ee061d21ab69c9cf2cf543160b1b85d89cd3d2e2a7902ca4c"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b3c4ea4948d93c9c29dc01c0c23eef66f2101bf75158c291b88de6525c55c3d1"}, - {file = "matplotlib-3.10.7-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22df30ffaa89f6643206cf13877191c63a50e8f800b038bc39bee9d2d4957632"}, - {file = "matplotlib-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b69676845a0a66f9da30e87f48be36734d6748024b525ec4710be40194282c84"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_amd64.whl", hash = "sha256:744991e0cc863dd669c8dc9136ca4e6e0082be2070b9d793cbd64bec872a6815"}, - {file = "matplotlib-3.10.7-cp313-cp313-win_arm64.whl", 
hash = "sha256:fba2974df0bf8ce3c995fa84b79cde38326e0f7b5409e7a3a481c1141340bcf7"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:932c55d1fa7af4423422cb6a492a31cbcbdbe68fd1a9a3f545aa5e7a143b5355"}, - {file = "matplotlib-3.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e38c2d581d62ee729a6e144c47a71b3f42fb4187508dbbf4fe71d5612c3433b"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:786656bb13c237bbcebcd402f65f44dd61ead60ee3deb045af429d889c8dbc67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09d7945a70ea43bf9248f4b6582734c2fe726723204a76eca233f24cffc7ef67"}, - {file = "matplotlib-3.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d0b181e9fa8daf1d9f2d4c547527b167cb8838fc587deabca7b5c01f97199e84"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:31963603041634ce1a96053047b40961f7a29eb8f9a62e80cc2c0427aa1d22a2"}, - {file = "matplotlib-3.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:aebed7b50aa6ac698c90f60f854b47e48cd2252b30510e7a1feddaf5a3f72cbf"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d883460c43e8c6b173fef244a2341f7f7c0e9725c7fe68306e8e44ed9c8fb100"}, - {file = "matplotlib-3.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07124afcf7a6504eafcb8ce94091c5898bbdd351519a1beb5c45f7a38c67e77f"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c17398b709a6cce3d9fdb1595c33e356d91c098cd9486cb2cc21ea2ea418e715"}, - {file = "matplotlib-3.10.7-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7146d64f561498764561e9cd0ed64fcf582e570fc519e6f521e2d0cfd43365e1"}, - {file = "matplotlib-3.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:90ad854c0a435da3104c01e2c6f0028d7e719b690998a2333d7218db80950722"}, - {file = 
"matplotlib-3.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:4645fc5d9d20ffa3a39361fcdbcec731382763b623b72627806bf251b6388866"}, - {file = "matplotlib-3.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:9257be2f2a03415f9105c486d304a321168e61ad450f6153d77c69504ad764bb"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1e4bbad66c177a8fdfa53972e5ef8be72a5f27e6a607cec0d8579abd0f3102b1"}, - {file = "matplotlib-3.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d8eb7194b084b12feb19142262165832fc6ee879b945491d1c3d4660748020c4"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4d41379b05528091f00e1728004f9a8d7191260f3862178b88e8fd770206318"}, - {file = "matplotlib-3.10.7-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a74f79fafb2e177f240579bc83f0b60f82cc47d2f1d260f422a0627207008ca"}, - {file = "matplotlib-3.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:702590829c30aada1e8cef0568ddbffa77ca747b4d6e36c6d173f66e301f89cc"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:f79d5de970fc90cd5591f60053aecfce1fcd736e0303d9f0bf86be649fa68fb8"}, - {file = "matplotlib-3.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:cb783436e47fcf82064baca52ce748af71725d0352e1d31564cbe9c95df92b9c"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5c09cf8f2793f81368f49f118b6f9f937456362bee282eac575cca7f84cda537"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:de66744b2bb88d5cd27e80dfc2ec9f0517d0a46d204ff98fe9e5f2864eb67657"}, - {file = "matplotlib-3.10.7-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53cc80662dd197ece414dd5b66e07370201515a3eaf52e7c518c68c16814773b"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:15112bcbaef211bd663fa935ec33313b948e214454d949b723998a43357b17b0"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d2a959c640cdeecdd2ec3136e8ea0441da59bcaf58d67e9c590740addba2cb68"}, - {file = "matplotlib-3.10.7-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3886e47f64611046bc1db523a09dd0a0a6bed6081e6f90e13806dd1d1d1b5e91"}, - {file = "matplotlib-3.10.7.tar.gz", hash = "sha256:a06ba7e2a2ef9131c79c49e63dad355d2d878413a0376c1727c8b9335ff731c7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = 
"matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = 
"matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = 
"matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, ] [package.dependencies] @@ -678,4 +678,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "fb2c9ed605f98095c89f5253045dfff02592fdaab4cc75e9adbb6ffc304fe751" +content-hash = "89e80cae69b517c09e5f788e36eec00c4589fe57d9ac74d1434fdd5e9a0dd481" diff --git a/security_scanning/tests/integration/defs/perf/pyproject.toml b/security_scanning/tests/integration/defs/perf/pyproject.toml index e8e15b43e1..5f262cc27d 100644 --- a/security_scanning/tests/integration/defs/perf/pyproject.toml +++ b/security_scanning/tests/integration/defs/perf/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" python = ">=3.10,<3.13" pandas = "^2.3.3" numpy = "<2" -matplotlib = "^3.10.7" +matplotlib = "^3.10.8" [build-system] diff --git 
a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index 2bc400198c..6bf91b1756 100644 --- a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -959,13 +959,13 @@ files = [ [[package]] name = "urllib3" -version = "2.6.1" +version = "2.6.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" files = [ - {file = "urllib3-2.6.1-py3-none-any.whl", hash = "sha256:e67d06fe947c36a7ca39f4994b08d73922d40e6cca949907be05efa6fd75110b"}, - {file = "urllib3-2.6.1.tar.gz", hash = "sha256:5379eb6e1aba4088bae84f8242960017ec8d8e3decf30480b3a1abdaa9671a3f"}, + {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, + {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] [package.extras] From 093465ed29bd9a6060e7eec6bbdf6ac64f35f2df Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:24:05 +0800 Subject: [PATCH 086/172] [https://nvbugs/5599176][fix] Unwaive fixed test for Ray (#9861) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index b4261692f8..769d371651 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -300,7 +300,6 @@ full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search S full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337) 
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847) -unittest/llmapi/test_llm_pytorch.py::test_llm_capture_request_error SKIP (https://nvbugs/5599176) examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143) unittest/llmapi/test_memory_profiling.py SKIP (https://nvbugs/5580781) triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414) From 3e39afea9a42c21c645c2ac9b2036cd4993db5a1 Mon Sep 17 00:00:00 2001 From: Yiteng Niu <6831097+niukuo@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:27:42 +0800 Subject: [PATCH 087/172] [None][infra] update nspect version for api change (#9899) Signed-off-by: Yiteng Niu <6831097+niukuo@users.noreply.github.com> --- jenkins/BuildDockerImage.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index 7af7908827..5049965f5e 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -702,7 +702,7 @@ pipeline { container("python3") { trtllm_utils.llmExecStepWithRetry(this, script: "pip3 install --upgrade pip") trtllm_utils.llmExecStepWithRetry(this, script: "pip3 install --upgrade requests") - def nspect_commit = "0e46042381ae25cb7af2f1d45853dfd8e1d54e2d" + def nspect_commit = "4cb9c0c42d44ebeeba1e40d2c3eb6aab6fb90173" withCredentials([string(credentialsId: "TRTLLM_NSPECT_REPO", variable: "NSPECT_REPO")]) { trtllm_utils.checkoutSource("${NSPECT_REPO}", nspect_commit, "nspect") } @@ -723,6 +723,7 @@ pipeline { cmd += "--check_launch_api " cmd += "--wait_success ${params.wait_success_seconds} " } + cmd += "--image " cmd += imageKeyToTag.values().join(" ") withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) { trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 6, shortCommondRunTimeMax: 7200) From 
bd441e9822c829b649c6884ceb5494a892331414 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:37:19 +0800 Subject: [PATCH 088/172] [None][infra] revert ucx to 1.19 (#9936) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> --- docker/common/install_ucx.sh | 2 +- jenkins/current_image_tags.properties | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index f20e99a52e..55da81e2c2 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -UCX_VERSION="v1.20.x" +UCX_VERSION="v1.19.x" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" UCX_REPO="https://github.com/openucx/ucx.git" diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index dad998f814..ed5f0078bd 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512110629-9786 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512110629-9786 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512110629-9786 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512110629-9786 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512091705-9823 From 110820bb154cb9642e8210d96a4d630132055317 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Fri, 12 Dec 2025 12:12:08 +0800 Subject: [PATCH 089/172] [TRTLLM-9792] [feat] Support multiple instances on single node for slurm scripts (#9900) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../slurm/benchmark/disaggr_torch.slurm | 142 +++++----------- .../slurm/benchmark/gen_server_config.py | 28 ++-- .../slurm/benchmark/start_worker.sh | 4 +- .../disaggregated/slurm/benchmark/submit.py | 154 +++++++++++++++--- 4 files changed, 184 insertions(+), 144 
deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index 2235767fa9..0e2c7e64d8 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -4,19 +4,11 @@ set -euo pipefail # Parse named arguments while [[ $# -gt 0 ]]; do case $1 in - # Hardware configuration - --gpus-per-node) gpus_per_node="$2"; shift 2 ;; - --numa-bind) numa_bind="$2"; shift 2 ;; - --ctx-nodes) ctx_nodes="$2"; shift 2 ;; - --gen-nodes) gen_nodes="$2"; shift 2 ;; - --ctx-world-size) ctx_world_size="$2"; shift 2 ;; - --gen-world-size) gen_world_size="$2"; shift 2 ;; # Worker configuration --num-ctx-servers) num_ctx_servers="$2"; shift 2 ;; - --ctx-config-path) ctx_config_path="$2"; shift 2 ;; --num-gen-servers) num_gen_servers="$2"; shift 2 ;; - --gen-config-path) gen_config_path="$2"; shift 2 ;; --concurrency-list) concurrency_list="$2"; shift 2 ;; + # Sequence and benchmark parameters --isl) isl="$2"; shift 2 ;; --osl) osl="$2"; shift 2 ;; @@ -24,8 +16,7 @@ while [[ $# -gt 0 ]]; do --benchmark-ratio) benchmark_ratio="$2"; shift 2 ;; --streaming) streaming="$2"; shift 2 ;; --use-nv-sa-benchmark) use_nv_sa_benchmark="$2"; shift 2 ;; - --benchmark-mode) benchmark_mode="$2"; shift 2 ;; - --cache-max-tokens) cache_max_tokens="$2"; shift 2 ;; + # Environment and paths --dataset-file) dataset_file="$2"; shift 2 ;; --model-path) model_path="$2"; shift 2 ;; @@ -36,17 +27,13 @@ while [[ $# -gt 0 ]]; do --container-image) container_image="$2"; shift 2 ;; --build-wheel) build_wheel="$2"; shift 2 ;; --trtllm-wheel-path) trtllm_wheel_path="$2"; shift 2 ;; - # Profiling - --nsys-on) nsys_on="$2"; shift 2 ;; - --ctx-profile-range) ctx_profile_range="$2"; shift 2 ;; - --gen-profile-range) gen_profile_range="$2"; shift 2 ;; + # Accuracy evaluation --enable-accuracy-test) enable_accuracy_test="$2"; shift 2 ;; --accuracy-model) 
accuracy_model="$2"; shift 2 ;; --accuracy-tasks) accuracy_tasks="$2"; shift 2 ;; --model-args-extra) model_args_extra="$2"; shift 2 ;; - # Worker environment variables - --worker-env-var) worker_env_var="$2"; shift 2 ;; + # Server environment variables --server-env-var) server_env_var="$2"; shift 2 ;; *) @@ -58,43 +45,30 @@ done # Print all parsed arguments echo "Parsed arguments:" -echo "Hardware Configuration:" -echo " gpus_per_node: ${gpus_per_node}" -echo " numa_bind: ${numa_bind}" -echo " ctx_nodes: ${ctx_nodes}" -echo " gen_nodes: ${gen_nodes}" -echo " ctx_world_size: ${ctx_world_size}" -echo " gen_world_size: ${gen_world_size}" echo echo "Worker Configuration:" echo " num_ctx_servers: ${num_ctx_servers}" -echo " ctx_config_path: ${ctx_config_path}" echo " num_gen_servers: ${num_gen_servers}" -echo " gen_config_path: ${gen_config_path}" echo " concurrency_list: ${concurrency_list}" echo echo "Benchmark Configuration:" -echo " use_nv_sa_benchmark: ${use_nv_sa_benchmark}" echo " isl: ${isl}" echo " osl: ${osl}" echo " multi_round: ${multi_round}" echo " benchmark_ratio: ${benchmark_ratio}" echo " streaming: ${streaming}" -echo " cache_max_tokens: ${cache_max_tokens}" -echo " benchmark_mode: ${benchmark_mode}" +echo " use_nv_sa_benchmark: ${use_nv_sa_benchmark}" echo echo "Environment Configuration:" echo " dataset_file: ${dataset_file}" -echo " container_mount: ${container_mount}" -echo " container_image: ${container_image}" echo " model_path: ${model_path}" echo " trtllm_repo: ${trtllm_repo}" +echo " work_dir: ${work_dir}" +echo " full_logdir: ${full_logdir}" +echo " container_mount: ${container_mount}" +echo " container_image: ${container_image}" echo " build_wheel: ${build_wheel}" echo " trtllm_wheel_path: ${trtllm_wheel_path}" -echo " work_dir: ${work_dir}" -echo " nsys_on: ${nsys_on}" -echo " ctx_profile_range: ${ctx_profile_range}" -echo " gen_profile_range: ${gen_profile_range}" echo echo "Accuracy Configuration:" echo " enable_accuracy_test: 
${enable_accuracy_test}" @@ -102,16 +76,11 @@ echo " accuracy_model: ${accuracy_model}" echo " accuracy_tasks: ${accuracy_tasks}" echo " model_args_extra: ${model_args_extra}" echo -echo "Worker Environment Variables:" -echo " worker_env_var: ${worker_env_var}" -echo echo "Server Environment Variables:" echo " server_env_var: ${server_env_var}" container_name="disaggr-test" -echo "Log directory: ${full_logdir}" - # Function to cleanup on failure cleanup_on_failure() { echo "Error: $1" @@ -128,8 +97,8 @@ if ! srun -l --container-image=${container_image} \ --container-name=${container_name} \ --container-mounts=${container_mount} \ --mpi=pmix \ - echo "Container up." &> ${full_logdir}/container_launch.log; then - cleanup_on_failure "Failed to start container. Check ${full_logdir}/container_launch.log" + echo "Container up." &> ${full_logdir}/1_container_launch.log; then + cleanup_on_failure "Failed to start container. Check ${full_logdir}/1_container_launch.log" fi # Install TensorRT-LLM @@ -140,8 +109,8 @@ if [ -n "${trtllm_wheel_path}" ]; then --container-mounts=${container_mount} --no-container-mount-home \ --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \ bash -c "pip install ${trtllm_wheel_path}" \ - &> ${full_logdir}/install.log; then - cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/install.log for details" + &> ${full_logdir}/2_install.log; then + cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details" fi echo "TensorRT-LLM wheel installation completed successfully" elif [ -d "${trtllm_repo}" ]; then @@ -157,8 +126,8 @@ elif [ -d "${trtllm_repo}" ]; then --container-mounts=${container_mount} \ --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \ bash -c "cd ${trtllm_repo} && ${build_command}" \ - &> ${full_logdir}/build.log; then - cleanup_on_failure "TensorRT-LLM build failed. 
Check ${full_logdir}/build.log for details" + &> ${full_logdir}/2_build.log; then + cleanup_on_failure "TensorRT-LLM build failed. Check ${full_logdir}/2_build.log for details" fi echo "TensorRT-LLM build completed successfully" fi @@ -168,60 +137,33 @@ elif [ -d "${trtllm_repo}" ]; then --container-mounts=${container_mount} --no-container-mount-home \ --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \ bash -c "cd ${trtllm_repo} && pip install -e ." \ - &> ${full_logdir}/install.log; then - cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/install.log for details" + &> ${full_logdir}/2_install.log; then + cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details" fi echo "TensorRT-LLM installation completed successfully" fi -# Get node lists +# Get node lists and replace the placeholder with the actual node names +echo "SLURM_NODELIST: ${SLURM_NODELIST}" all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort)) -total_nodes_num=${#all_nodes[@]} -echo "all_nodes: ${all_nodes[@]}, total_nodes_num: ${total_nodes_num}" +all_nodes_str=$(IFS=','; echo "${all_nodes[*]}") +echo "all_nodes_str: ${all_nodes_str}" -# Split nodes between gen and ctx workers -gen_node_list=(${all_nodes[@]:0:${gen_nodes}}) -ctx_node_list=(${all_nodes[@]:${gen_nodes}:${total_nodes_num}}) +start_worker_cmds_file=${full_logdir}/start_worker_cmds.txt +IFS=',' read -r -a node_array <<< "$all_nodes_str" +for i in "${!node_array[@]}"; do + current_val="${node_array[$i]}" + placeholder="<node${i}_placeholder>" -echo "gen_nodes: ${gen_node_list[@]}, num_nodes: ${gen_nodes}" -echo "ctx_nodes: ${ctx_node_list[@]}, num_nodes: ${ctx_nodes}" - -rm -rf ${full_logdir}/hostnames -rm -rf ${full_logdir}/server_config.yaml - -gen_nodes_num_in_single_server=$((${gen_nodes} / ${num_gen_servers})) -ctx_nodes_num_in_single_server=$((${ctx_nodes} / ${num_ctx_servers})) -echo "gen_nodes_num_in_single_server: ${gen_nodes_num_in_single_server}" -echo 
"ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}" - -# start the gen workers -echo "Starting gen workers..." -for i in $(seq 0 $((num_gen_servers - 1))); do - srun -l -N ${gen_nodes_num_in_single_server} \ - --ntasks=$((gen_world_size)) \ - --ntasks-per-node=${gpus_per_node} \ - --container-image=${container_image} \ - --container-name=${container_name} \ - --container-mounts=${container_mount} \ - --mpi=pmix \ - bash ${work_dir}/start_worker.sh \ - "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_profile_range}" "${gen_config_path}" "${worker_env_var}" \ - &> ${full_logdir}/output_gen_${i}.log & + # Use sed to replace the placeholder with the value in-place + sed -i "s|$placeholder|$current_val|g" "${start_worker_cmds_file}" + echo "Replaced $placeholder with $current_val" done -# start the ctx workers -echo "Starting ctx workers..." -for i in $(seq 0 $((num_ctx_servers - 1))); do - srun -l -N ${ctx_nodes_num_in_single_server} \ - --ntasks=$((ctx_world_size)) \ - --ntasks-per-node=${gpus_per_node} \ - --container-image=${container_image} \ - --container-name=${container_name} \ - --container-mounts=${container_mount} \ - --mpi=pmix \ - bash ${work_dir}/start_worker.sh \ - "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_profile_range}" "${ctx_config_path}" "${worker_env_var}" \ - &> ${full_logdir}/output_ctx_${i}.log & +echo "Starting worker commands from ${start_worker_cmds_file}..." 
+cat ${start_worker_cmds_file} | while read cmd; do + echo "Starting worker command: ${cmd}" + eval "${cmd}" done # start the server (in background) @@ -231,7 +173,7 @@ srun -l --container-name=${container_name} \ --container-mounts=${container_mount} \ --mpi=pmix --overlap -N 1 -n 1 \ bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} "${server_env_var}" \ - &> ${full_logdir}/output_server.log & + &> ${full_logdir}/4_output_server.log & # Wait for server to be ready (runs synchronously) echo "Waiting for server to be ready..." @@ -239,8 +181,8 @@ if ! srun -l --container-name=${container_name} \ --container-mounts=${container_mount} \ --mpi=pmix --overlap -N 1 -n 1 \ bash ${work_dir}/wait_server.sh ${full_logdir} \ - &> ${full_logdir}/wait_server.log; then - cleanup_on_failure "Server failed to become ready. Check ${full_logdir}/wait_server.log for details" + &> ${full_logdir}/5_wait_server.log; then + cleanup_on_failure "Server failed to become ready. Check ${full_logdir}/5_wait_server.log for details" fi echo "Server is ready!" @@ -253,8 +195,8 @@ if [ "${use_nv_sa_benchmark}" = "true" ]; then --mpi=pmix --overlap -N 1 -n 1 \ bash ${work_dir}/run_benchmark_nv_sa.sh \ "${model_path}" "${isl}" "${osl}" "${benchmark_ratio}" "${multi_round}" "${num_gen_servers}" "${concurrency_list}" "${streaming}" "${full_logdir}/" \ - &> ${full_logdir}/bench.log; then - cleanup_on_failure "NVIDIA SA benchmark failed. Check ${full_logdir}/bench.log for details" + &> ${full_logdir}/6_bench.log; then + cleanup_on_failure "NVIDIA SA benchmark failed. Check ${full_logdir}/6_bench.log for details" fi else echo "Using default benchmark script..." @@ -263,8 +205,8 @@ else --mpi=pmix --overlap -N 1 -n 1 \ bash ${work_dir}/run_benchmark.sh \ "${model_path}" "${dataset_file}" "${multi_round}" "${num_gen_servers}" "${concurrency_list}" "${streaming}" "${full_logdir}/" \ - &> ${full_logdir}/bench.log; then - cleanup_on_failure "Benchmark failed. 
Check ${full_logdir}/bench.log for details" + &> ${full_logdir}/6_bench.log; then + cleanup_on_failure "Benchmark failed. Check ${full_logdir}/6_bench.log for details" fi fi echo "Benchmark completed successfully" @@ -278,8 +220,8 @@ if [ "${enable_accuracy_test}" = "true" ]; then bash ${work_dir}/accuracy_eval.sh \ "${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" \ "${model_args_extra}" "${full_logdir}/accuracy_eval" \ - &> ${full_logdir}/accuracy_eval.log; then - cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/accuracy_eval.log for details" + &> ${full_logdir}/7_accuracy_eval.log; then + cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/7_accuracy_eval.log for details" fi echo "Accuracy evaluation completed successfully" fi diff --git a/examples/disaggregated/slurm/benchmark/gen_server_config.py b/examples/disaggregated/slurm/benchmark/gen_server_config.py index c427f5d42b..478d7bf22b 100644 --- a/examples/disaggregated/slurm/benchmark/gen_server_config.py +++ b/examples/disaggregated/slurm/benchmark/gen_server_config.py @@ -19,10 +19,6 @@ if __name__ == "__main__": type=str, default="logs", help="Work directory") - parser.add_argument("--worker_port", - type=int, - default=8336, - help="Worker port") parser.add_argument("--server_port", type=int, default=8333, @@ -49,21 +45,21 @@ if __name__ == "__main__": print(f"All hostnames found in {hostnames_folder}") # get the ctx and gen hostnames from the hostnames file - ctx_hostnames = [] - gen_hostnames = [] + ctx_urls = [] + gen_urls = [] for hostname_file in hostnames: hostname_file_path = os.path.join(hostnames_folder, hostname_file) with open(hostname_file_path, 'r') as f: - actual_hostname = f.read().strip() - print(f"Hostname: {actual_hostname} in {hostname_file}") + url = f.read().strip() + print(f"url: {url} in {hostname_file}") - if hostname_file.startswith("CTX"): - ctx_hostnames.append(actual_hostname) - elif hostname_file.startswith("GEN"): - 
gen_hostnames.append(actual_hostname) + if hostname_file.startswith("CTX"): + ctx_urls.append(url) + elif hostname_file.startswith("GEN"): + gen_urls.append(url) - print(f"ctx_hostnames: {ctx_hostnames}") - print(f"gen_hostnames: {gen_hostnames}") + print(f"ctx_urls: {ctx_urls}") + print(f"gen_urls: {gen_urls}") # get current hostname from env hostname = socket.gethostname() @@ -75,11 +71,11 @@ if __name__ == "__main__": 'backend': 'pytorch', 'context_servers': { 'num_instances': args.num_ctx_servers, - 'urls': [f'{host}:{args.worker_port}' for host in ctx_hostnames] + 'urls': ctx_urls }, 'generation_servers': { 'num_instances': args.num_gen_servers, - 'urls': [f'{host}:{args.worker_port}' for host in gen_hostnames] + 'urls': gen_urls } } diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh index a8576725c0..f51fccd6f0 100644 --- a/examples/disaggregated/slurm/benchmark/start_worker.sh +++ b/examples/disaggregated/slurm/benchmark/start_worker.sh @@ -43,8 +43,8 @@ echo "config_file: ${config_file}" # if SLURM_NODEID is 0, save the hostname to a file if [ "${SLURM_NODEID}" = "0" ]; then mkdir -p ${log_dir}/hostnames/ - echo $(hostname) > ${log_dir}/hostnames/${role}_${instance_id}.txt - echo "hostname saved to ${log_dir}/hostnames/${role}_${instance_id}.txt" + echo $(hostname):${port} > ${log_dir}/hostnames/${role}_${instance_id}.txt + echo "hostname:port saved to ${log_dir}/hostnames/${role}_${instance_id}.txt" fi nsys_prefix="" diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 4446a88285..d605c9fd59 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -2,11 +2,14 @@ import argparse import glob +import json +import math import os import shutil import subprocess import sys from datetime import datetime +from typing import Any, Dict, List import yaml @@ -47,7 
+50,62 @@ def save_worker_config(config, output_path, worker_type): def calculate_nodes(world_size, num_servers, gpus_per_node): """Calculate required nodes based on world size and server count.""" - return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers + return math.ceil(world_size * num_servers / gpus_per_node) + + +def allocate_gpus( + total_nodes: int, + gpus_per_node: int, + num_gen_servers: int, + num_ctx_servers: int, + gen_world_size: int, + ctx_world_size: int, + base_port: int = 8000, +) -> List[Dict[str, Any]]: + allocations = [] + hostnames = [f"" for i in range(total_nodes)] + + global_gpu_cursor = 0 + + def get_gpu_location(gpus_per_node: int): + node_id = global_gpu_cursor // gpus_per_node + local_gpu_id = global_gpu_cursor % gpus_per_node + return node_id, local_gpu_id + + def assign_server(server_allocation: Dict[str, Any], world_size: int, + gpus_per_node: int): + nonlocal global_gpu_cursor + for _ in range(world_size): + node_id, gpu_id = get_gpu_location(gpus_per_node) + hostname = hostnames[node_id] + if hostname not in server_allocation["nodes"]: + server_allocation["nodes"][hostname] = [] + server_allocation["nodes"][hostname].append(gpu_id) + global_gpu_cursor += 1 + + def assign_servers( + server_allocations: List[Dict[str, Any]], + server_type: str, + num_servers: int, + world_size: int, + gpus_per_node: int, + ): + for i in range(num_servers): + server_allocation = { + "server_type": server_type, + "server_id": i, + "port": base_port + i, + "nodes": {}, + } + assign_server(server_allocation, world_size, gpus_per_node) + server_allocations.append(server_allocation) + + assign_servers(allocations, "GEN", num_gen_servers, gen_world_size, + gpus_per_node) + assign_servers(allocations, "CTX", num_ctx_servers, ctx_world_size, + gpus_per_node) + + return allocations def submit_job(config, log_dir): @@ -87,6 +145,7 @@ def submit_job(config, log_dir): # Get number of servers from config ctx_num = hw_config['num_ctx_servers'] 
gen_num = hw_config['num_gen_servers'] + gpus_per_node = hw_config['gpus_per_node'] # Get mtp_size from gen config's speculative_config gen_config = config['worker_config']['gen'] @@ -97,15 +156,13 @@ def submit_job(config, log_dir): ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size'] ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size'] ctx_world_size = ctx_tp_size * ctx_pp_size - ctx_nodes = calculate_nodes(ctx_world_size, ctx_num, - hw_config['gpus_per_node']) + ctx_nodes = calculate_nodes(ctx_world_size, ctx_num, gpus_per_node) gen_tp_size = config['worker_config']['gen']['tensor_parallel_size'] gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size'] gen_world_size = gen_tp_size * gen_pp_size - gen_nodes = calculate_nodes(gen_world_size, gen_num, - hw_config['gpus_per_node']) + gen_nodes = calculate_nodes(gen_world_size, gen_num, gpus_per_node) total_nodes = ctx_nodes + gen_nodes - total_tasks = total_nodes * hw_config['gpus_per_node'] + total_tasks = total_nodes * gpus_per_node # Generate log directory path based on configuration isl = config['benchmark']['input_length'] @@ -150,6 +207,69 @@ def submit_job(config, log_dir): save_worker_config(config, ctx_config_path, 'ctx') save_worker_config(config, gen_config_path, 'gen') + # Prepare allocation template + allocations = allocate_gpus( + total_nodes=total_nodes, + gpus_per_node=gpus_per_node, + num_gen_servers=gen_num, + num_ctx_servers=ctx_num, + gen_world_size=gen_world_size, + ctx_world_size=ctx_world_size, + ) + with open(os.path.join(log_dir, "allocations.json"), "w") as f: + json.dump(allocations, f, indent=2) + + # Generate start worker commands with placeholder hostnames + start_worker_cmds = [] + for allocation in allocations: + server_type = allocation["server_type"] + cuda_devices = ",".join( + [str(device) for device in list(allocation["nodes"].values())[0]]) + worker_env_var = env_config[ + 'worker_env_var'] + f" CUDA_VISIBLE_DEVICES={cuda_devices}" 
+ cmd = [ + "srun", + "-l", + "--nodelist", + ",".join(allocation["nodes"].keys()), + "-N", + str(len(allocation["nodes"])), + "--ntasks", + str(gen_world_size) + if server_type == "GEN" else str(ctx_world_size), + "--ntasks-per-node", + str(gpus_per_node), + "--container-image", + env_config['container_image'], + "--container-name", + "disaggr-test", + "--container-mounts", + env_config['container_mount'], + "--mpi", + "pmix", + "--overlap", + "bash", + os.path.join(env_config['work_dir'], "start_worker.sh"), + server_type, + str(allocation["server_id"]), + env_config['model_path'], + str(allocation["port"]), + config['benchmark']['mode'], + config['benchmark']['concurrency_list'], + str(slurm_config['numa_bind']), + log_dir, + str(profiling_config['nsys_on']), + profiling_config['gen_profile_range'] + if server_type == "GEN" else profiling_config['ctx_profile_range'], + gen_config_path if server_type == "GEN" else ctx_config_path, + f'"{worker_env_var}"', + f"&> {log_dir}/3_output_{server_type}_{allocation['server_id']}.log &", + ] + start_worker_cmds.append(" ".join(cmd)) + + with open(os.path.join(log_dir, "start_worker_cmds.txt"), "w") as f: + f.write("\n".join(start_worker_cmds) + "\n") + # Prepare sbatch command # yapf: disable cmd = [ @@ -162,21 +282,14 @@ def submit_job(config, log_dir): f'--ntasks={total_tasks}', f'--ntasks-per-node={hw_config["gpus_per_node"]}', *([] if not slurm_config['set_segment'] else [f'--segment={total_nodes}']), + f'--output={log_dir}/slurm-%j.out', + f'--error={log_dir}/slurm-%j.err', *([arg for arg in slurm_config['extra_args'].split() if arg]), slurm_config['script_file'], - # Hardware configuration - '--gpus-per-node', str(hw_config['gpus_per_node']), - '--numa-bind', str(slurm_config['numa_bind']).lower(), - '--ctx-nodes', str(ctx_nodes), # Number of nodes needed for ctx workers - '--gen-nodes', str(gen_nodes), # Number of nodes needed for gen workers - '--ctx-world-size', str(ctx_world_size), # World size for ctx workers - 
'--gen-world-size', str(gen_world_size), # World size for gen workers # Worker configuration '--num-ctx-servers', str(ctx_num), - '--ctx-config-path', ctx_config_path, '--num-gen-servers', str(gen_num), - '--gen-config-path', gen_config_path, '--concurrency-list', config['benchmark']['concurrency_list'], # Sequence and benchmark parameters @@ -186,9 +299,6 @@ def submit_job(config, log_dir): '--benchmark-ratio', str(config['benchmark']['benchmark_ratio']), '--streaming', str(config['benchmark']['streaming']).lower(), '--use-nv-sa-benchmark', str(config['benchmark']['use_nv_sa_benchmark']).lower(), - '--benchmark-mode', config['benchmark']['mode'], - '--cache-max-tokens', str(config['worker_config']['gen']['cache_transceiver_config'] - ['max_tokens_in_buffer']), # Environment and paths '--dataset-file', config['benchmark']['dataset_file'], @@ -201,20 +311,12 @@ def submit_job(config, log_dir): '--build-wheel', str(env_config['build_wheel']).lower(), '--trtllm-wheel-path', env_config['trtllm_wheel_path'], - # Profiling - '--nsys-on', str(profiling_config['nsys_on']).lower(), - '--ctx-profile-range', profiling_config['ctx_profile_range'], - '--gen-profile-range', profiling_config['gen_profile_range'], - # Accuracy evaluation '--enable-accuracy-test', str(config['accuracy']['enable_accuracy_test']).lower(), '--accuracy-model', config['accuracy']['model'], '--accuracy-tasks', config['accuracy']['tasks'], '--model-args-extra', config['accuracy']['model_args_extra'], - # Worker environment variables - '--worker-env-var', env_config['worker_env_var'], - # Server environment variables '--server-env-var', env_config['server_env_var'] ] From fded6c393d527eab32814e3a9e7d9d1f5d5dbc7c Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:23:33 +0800 Subject: [PATCH 090/172] [TRTLLM-9262][test] add groupgemm ada case for rcca (#9833) Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- 
tests/integration/defs/accuracy/test_llm_api_pytorch.py | 4 +++- tests/integration/test_lists/qa/llm_function_core.txt | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 9c3b105ecd..f4bb84ae63 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3411,7 +3411,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm, is_integration_test=True) - @skip_pre_hopper + @skip_pre_ada @parametrize_with_ids("torch_compile", [False, True]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", @@ -3419,6 +3419,8 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, torch_compile): + "RCCA: https://nvbugspro.nvidia.com/bug/5284463" + "Need to check Ada support" torch_compile_config = TorchCompileConfig( enable_fullgraph=True, enable_piecewise_cuda_graph=cuda_graph, diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 7facb002eb..47a017378c 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -511,6 +511,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] From 9d1f2a9925177b58d42333737b7249523a3c8116 Mon Sep 17 00:00:00 2001 From: Yifei Wang Date: Thu, 11 Dec 2025 21:33:22 -0800 Subject: [PATCH 091/172] [#6425][fix] address CUDA stream sync issue in ModelRunnerCPP (#6426) Signed-off-by: yifei.w --- tensorrt_llm/runtime/model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py index 308af9b012..6385f804dd 100644 --- a/tensorrt_llm/runtime/model_runner.py +++ b/tensorrt_llm/runtime/model_runner.py @@ -473,6 +473,7 @@ class ModelRunnerMixin: prompt_table, torch.Tensor), "Prompt table should be str or torch.Tensor" prompt_table_data = prompt_table.to(dtype=self.dtype) + torch.cuda.current_stream().synchronize() return prompt_table_data From eeb03f314aefbd73feb1681d34028c338d16a7a0 Mon Sep 17 00:00:00 2001 From: yuanjingx87 <197832395+yuanjingx87@users.noreply.github.com> Date: Thu, 11 Dec 2025 22:46:14 -0800 Subject: [PATCH 092/172] [None][infra] Replace the deprecated github token (#9915) Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com> --- jenkins/GenerateLock.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/GenerateLock.groovy b/jenkins/GenerateLock.groovy index 0e2dd89ed1..f059cf69f2 100644 --- a/jenkins/GenerateLock.groovy +++ b/jenkins/GenerateLock.groovy @@ -42,7 +42,7 @@ def getGitCredentialId (String repoUrlKey) { if (repoUrlKey == "tensorrt_llm_internal") { return 'svc_tensorrt_gitlab_api_token_no_username_as_string' } else { - return 'github-token-trtllm-ci' + return 'github-cred-trtllm-ci' } } From 711016c799dab7a654c6ef4933c469a470a9a2ab Mon Sep 17 00:00:00 
2001 From: Yihan Wang Date: Fri, 12 Dec 2025 15:15:13 +0800 Subject: [PATCH 093/172] [https://nvbugs/5736923][infra] Waive timeout disaggregated/test_auto_scaling[http-round_robin] test (#9942) Signed-off-by: Yihan Wang --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 769d371651..57aa9182d0 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -419,6 +419,7 @@ unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_ disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] SKIP (https://nvbugs/5736923) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:float32] SKIP (https://nvbugs/5721644) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) From fd3d3a553d5b12f3d1ba8349772da07fb0b080d6 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 15:55:22 +0800 Subject: [PATCH 094/172] [None][chore] Modify python ipc_util to align with C++ path (#9894) Signed-off-by: yufeiwu <230315618+yufeiwu-nv@users.noreply.github.com> Co-authored-by: ruodil 
<200874449+ruodil@users.noreply.github.com> --- tensorrt_llm/_ipc_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_ipc_utils.py b/tensorrt_llm/_ipc_utils.py index 4751b060e1..1d8e911633 100644 --- a/tensorrt_llm/_ipc_utils.py +++ b/tensorrt_llm/_ipc_utils.py @@ -47,7 +47,9 @@ def can_access_peer(mapping: Mapping) -> bool: # Early exit if devices are on different nodes if mapping.get_node_rank(rank) != mapping.node_rank: - logger.info(f"Detect inter-node TP between rank {mapping.rank} and rank {rank}") + logger.info( + f"Detect inter-node TP between rank {mapping.rank} and rank {rank}, fail to access peer GPU memory" + ) return False # Skip if same device From 2fc94e5dd75694d7d86fc36a3f6dff8214b3faf8 Mon Sep 17 00:00:00 2001 From: kris1025 <34030136+kris1025@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:30:09 +0800 Subject: [PATCH 095/172] [None][chore] unwaive qwen3 accuracy test (#9895) Signed-off-by: linquanh --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 57aa9182d0..3822ae6d53 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -412,7 +412,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) From 61745f034a8305e2de25d0af094992cb0b19105e Mon Sep 17 00:00:00 2001 From: chenfeiz0326 Date: Fri, 12 Dec 2025 17:16:50 +0800 Subject: [PATCH 096/172] [https://nvbugs/5727481][ci] Fix Port Conflict in Perf-Sanity CI Test (#9896) Signed-off-by: Chenfei Zhang --- .../defs/perf/open_search_db_utils.py | 60 +++++-- tests/integration/defs/perf/test_perf.py | 56 ++---- tests/integration/defs/perf/utils.py | 164 +++++++++++------- .../l0_gb200_multi_gpus_perf_sanity.yml | 3 - tests/integration/test_lists/waives.txt | 3 - tests/scripts/perf-sanity/l0_dgx_b200.yaml | 18 +- tests/scripts/perf-sanity/l0_dgx_b300.yaml | 12 +- .../perf-sanity/l0_gb200_multi_gpus.yaml | 30 ++-- .../perf-sanity/l0_gb200_multi_nodes.yaml | 6 +- 9 files changed, 198 insertions(+), 154 deletions(-) diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 434af387a5..5824670d6f 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -22,7 +22,8 @@ import sys import time from datetime import datetime -from defs.trt_test_alternative import print_info +import yaml +from defs.trt_test_alternative import print_info, print_warning _project_root = os.path.abspath( os.path.join(os.path.dirname(__file__), '../../../..')) @@ -337,11 +338,38 @@ def get_history_data(new_data_dict, gpu_type, match_keys): def get_latest_data(data_list): if not data_list: return None - time_format = "%b %d, %Y @ %H:%M:%S.%f" - # Find the item with the maximum ts_created value - 
latest_data = max( - data_list, - key=lambda x: datetime.strptime(x["ts_created"], time_format)) + + # Supported timestamp formats + time_formats = [ + "%Y-%m-%dT%H:%M:%S.%fZ", # ISO 8601: 2025-12-11T06:25:25.338Z + "%Y-%m-%dT%H:%M:%SZ", # ISO 8601 without ms: 2025-12-11T06:25:25Z + "%Y-%m-%dT%H:%M:%S.%f", # ISO 8601 without Z: 2025-12-11T06:25:25.338 + "%Y-%m-%dT%H:%M:%S", # ISO 8601 basic: 2025-12-11T06:25:25 + "%b %d, %Y @ %H:%M:%S.%f", # OpenSearch format: Dec 11, 2025 @ 06:25:25.338 + ] + + def parse_timestamp(timestamp): + if isinstance(timestamp, (int, float)): + # Handle milliseconds timestamp + if timestamp > 1e12: + timestamp = timestamp / 1000 + return datetime.fromtimestamp(timestamp) + if isinstance(timestamp, datetime): + return timestamp + + timestamp_str = str(timestamp) + for fmt in time_formats: + try: + return datetime.strptime(timestamp_str, fmt) + except ValueError: + continue + + print_warning(f"Unable to parse timestamp: {timestamp_str}") + return datetime.fromtimestamp(0) + + # Find the item with the maximum @timestamp value + latest_data = max(data_list, + key=lambda x: parse_timestamp(x.get("@timestamp", 0))) return latest_data history_baseline_dict = {} @@ -494,10 +522,20 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict, print_info(f"Fail to post data to {TEST_INFO_PROJECT_NAME}, error: {e}") -def print_regressive_test_cases(regressive_data_list): +def write_regressive_test_cases(regressive_data_list, new_data_dict, + perf_result_output_dir): """ - Print regressive test cases + Write regressive test cases to regressive.yaml """ - print_info(f"Found {len(regressive_data_list)} regressive test cases") - for data in regressive_data_list: - print_info(f"Regressive test case: {data}") + regression_yaml_path = os.path.join(perf_result_output_dir, + "regression.yaml") + with open(regression_yaml_path, 'w') as f: + yaml.dump(regressive_data_list, f, default_flow_style=False) + + perf_data_yaml_path = 
os.path.join(perf_result_output_dir, "perf_data.yaml") + with open(perf_data_yaml_path, 'w') as f: + yaml.dump(list(new_data_dict.values()), f, default_flow_style=False) + + if len(regressive_data_list) > 0: + print_warning( + f"Found {len(regressive_data_list)} regressive test cases") diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index def4cf505b..f2b3cc68df 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -32,7 +32,7 @@ from ..conftest import get_llm_root, llm_models_root, trt_environment from .open_search_db_utils import (add_id, get_history_data, get_job_info, post_new_perf_data, prepare_baseline_data, prepare_regressive_test_cases, - print_regressive_test_cases) + write_regressive_test_cases) from .pytorch_model_config import get_model_yaml_config from .sampler_options_config import get_sampler_options_config from .utils import (AbstractPerfScriptTestClass, PerfAggrScriptTestCmds, @@ -605,17 +605,11 @@ class ServerConfig: def to_cmd(self, output_dir: str, numa_bind: bool = False, - disagg_serving_type: str = "", - hostname: str = "localhost", - port: int = 8000) -> List[str]: + disagg_serving_type: str = "") -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name config_filename = f"extra-llm-api-config.{self.name}.yml" - if "CTX" in disagg_serving_type: - config_filename = f"extra-llm-api-config.{self.name}.ctx.yml" - elif "GEN" in disagg_serving_type: - config_filename = f"extra-llm-api-config.{self.name}.gen.yml" config_path = os.path.join(output_dir, config_filename) numa_bind_cmd = [] @@ -623,9 +617,8 @@ class ServerConfig: numa_bind_cmd = ["numactl", "-m 0,1"] cmd = numa_bind_cmd + [ - "trtllm-serve", self.model_path, "--host", hostname, "--port", - str(port), "--backend", "pytorch", "--extra_llm_api_options", - config_path + "trtllm-serve", self.model_path, "--backend", 
"pytorch", + "--extra_llm_api_options", config_path ] return cmd @@ -759,7 +752,7 @@ class ClientConfig: self.model_path = "" self.env_vars = env_vars - def to_cmd(self, need_hostname: bool = True) -> List[str]: + def to_cmd(self) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name @@ -775,9 +768,6 @@ class ClientConfig: "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency", str(self.concurrency) ] - if need_hostname: - hostname_port = ["--host", "localhost", "--port", "8000"] - benchmark_cmd.extend(hostname_port) if self.backend: benchmark_cmd.append("--backend") benchmark_cmd.append(self.backend) @@ -949,7 +939,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str, # Create ctx_server config data ctx_server_config_data = { - 'name': 'ctx_server', + 'name': 'ctx', 'model_name': model_name, 'gpus': hardware.get('gpus_per_ctx_server'), 'gpus_per_node': hardware.get('gpus_per_node'), @@ -958,7 +948,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str, # Create gen_server config data gen_server_config_data = { - 'name': 'gen_server', + 'name': 'gen', 'model_name': model_name, 'gpus': hardware.get('gpus_per_gen_server'), 'gpus_per_node': hardware.get('gpus_per_node'), @@ -1749,7 +1739,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): for client_config in client_configs: server_cmds.append(server_cmd) server_envs.append(server_env) - client_cmd = client_config.to_cmd(need_hostname=True) + client_cmd = client_config.to_cmd() client_env = client_config.to_env() client_cmds.append(client_cmd) client_envs.append(client_env) @@ -1765,14 +1755,10 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): disagg_server_envs = [] benchmark_cmds = [] benchmark_envs = [] - # Create hostnames directory - hostnames_dir = os.path.join(output_dir, "hostnames") - if not os.path.exists(hostnames_dir): - os.makedirs(hostnames_dir, exist_ok=True) - + 
cmd_idx = 0 for disagg_config in self._config.disagg_configs: disagg_serving_type = disagg_config['disagg_serving_type'] - hostname = disagg_config['hostname'] + disagg_config['hostname'] numa_bind = disagg_config['numa_bind'] ctx_server_cmd = None ctx_server_env = None @@ -1783,18 +1769,11 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): benchmark_cmd = None benchmark_env = None if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: - # Write hostname to hostnames folder - hostname_file = os.path.join(hostnames_dir, - f"{disagg_serving_type}.txt") - with open(hostname_file, 'w') as f: - f.write(hostname) - # Generate CTX or GEN server commands if this is a CTX or GEN node is_ctx = "CTX" in disagg_serving_type server_config = disagg_config[ 'ctx_server'] if is_ctx else disagg_config['gen_server'] server_cmd = server_config.to_cmd(output_dir, numa_bind, - disagg_serving_type, hostname, - 8336) + disagg_serving_type) server_env = server_config.to_env() if is_ctx: ctx_server_cmd = server_cmd @@ -1804,7 +1783,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): gen_server_env = server_env # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() - config_filename = f"extra-llm-api-config.{server_config.name}.{'ctx' if is_ctx else 'gen'}.yml" + config_filename = f"extra-llm-api-config.{server_config.name}.yml" config_path = os.path.join(output_dir, config_filename) with open(config_path, 'w') as f: f.write(config_content) @@ -1813,15 +1792,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): # Generate DISAGG server command if this is the DISAGG server node disagg_server_cmd = [ "trtllm-serve", "disaggregated", "-c", - f"{output_dir}/server_config.yaml", "-t", + f"{output_dir}/server_config.{cmd_idx}.yaml", "-t", str(timeout), "-r", str(timeout) ] disagg_server_env = to_env_dict(disagg_config['server_env_var']) elif "BENCHMARK" in disagg_serving_type: # Generate benchmark command if this is 
the BENCHMARK server node - benchmark_cmd = disagg_config['client'].to_cmd( - need_hostname=False) + benchmark_cmd = disagg_config['client'].to_cmd() benchmark_env = disagg_config['client'].to_env() ctx_server_cmds.append(ctx_server_cmd) ctx_server_envs.append(ctx_server_env) @@ -1831,6 +1809,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): disagg_server_envs.append(disagg_server_env) benchmark_cmds.append(benchmark_cmd) benchmark_envs.append(benchmark_env) + cmd_idx += 1 return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: @@ -2542,7 +2521,10 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): post_new_perf_data(new_baseline_data_dict, new_data_dict, regressive_data_list) - print_regressive_test_cases(regressive_data_list) + perf_result_output_dir = os.path.join(self._output_dir, + self._test_param_labels) + write_regressive_test_cases(regressive_data_list, new_data_dict, + perf_result_output_dir) def _get_engine_dir(self) -> str: """ diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index 2dcb5bf74f..a52ec84989 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -32,6 +32,8 @@ from _pytest.python import Function from defs.trt_test_alternative import (check_output, popen, print_error, print_info) +from tensorrt_llm._utils import get_free_port + from ..common import get_trt_llm_lib_dir, venv_mpi_check_output from ..local_venv import PythonVenvRunnerImpl from ..test_list_parser import parse_test_list @@ -129,6 +131,10 @@ def temp_wd(path): os.chdir(prev_cwd) +def add_host_port_to_cmd(cmd: List[str], host: str, port: int) -> List[str]: + return cmd + ["--host", host, "--port", str(port)] + + class PerfBenchScriptTestCmds(NamedTuple): data_cmds: List[List[str]] build_cmd: List[str] @@ -276,31 +282,29 @@ 
class PerfAggrScriptTestCmds(NamedTuple): client_file_path = os.path.join( self.output_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log") try: - server_envs = copy.deepcopy(os.environ) - # server_envs.update(self.server_envs[cmd_idx]) - print_info( - f"Starting server. cmd is {self.server_cmds[cmd_idx]} envs are {server_envs}" - ) + server_hostname = "localhost" + server_port = get_free_port() + server_cmd = add_host_port_to_cmd(self.server_cmds[cmd_idx], + server_hostname, server_port) + print_info(f"Starting server. cmd is {server_cmd}") with open(server_file_path, 'w') as server_ctx: server_proc = subprocess.Popen( - self.server_cmds[cmd_idx], + server_cmd, stdout=server_ctx, stderr=subprocess.STDOUT, - env=server_envs, + env=copy.deepcopy(os.environ), ) - self.wait_for_endpoint_ready("http://localhost:8000/health", - timeout=self.timeout) - client_envs = copy.deepcopy(os.environ) - # client_envs.update(self.client_envs[cmd_idx]) - print_info( - f"Starting client. cmd is {self.client_cmds[cmd_idx]} envs are {client_envs}" - ) + self.wait_for_endpoint_ready( + f"http://{server_hostname}:{server_port}/health", + timeout=self.timeout) + client_cmd = add_host_port_to_cmd(self.client_cmds[cmd_idx], + server_hostname, server_port) + print_info(f"Starting client. 
cmd is {client_cmd}") output = subprocess.check_output( - self.client_cmds[cmd_idx], - env=client_envs, + client_cmd, stderr=subprocess.STDOUT, + env=copy.deepcopy(os.environ), ).decode() - with open(client_file_path, 'w') as client_ctx: client_ctx.write(output) finally: @@ -390,16 +394,21 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): num_gen_servers: int output_dir: str - def _generate_disagg_server_config(self, - cmd_idx: int, - ctx_gen_port: int = 8336, - disagg_server_port: int = 8333) -> str: + def _generate_hostname_file(self, cmd_idx: int, port: int): + # Create hostnames directory + hostnames_dir = os.path.join(self.output_dir, f"hostnames-{cmd_idx}") + if not os.path.exists(hostnames_dir): + os.makedirs(hostnames_dir, exist_ok=True) + hostname_file = os.path.join(hostnames_dir, + f"{self.disagg_serving_type}.txt") + with open(hostname_file, 'w') as f: + f.write(f"{self.hostname}:{port}") + + def _generate_disagg_server_config(self, cmd_idx: int, + disagg_server_port: int) -> str: print_info( f"Generating disagg server config for command index {cmd_idx}") - # Wait for all hostname files to be created - hostnames_folder = os.path.join(self.output_dir, "hostnames") - print_info(f"Waiting for hostnames folder: {hostnames_folder}") - + hostnames_folder = os.path.join(self.output_dir, f"hostnames-{cmd_idx}") expected_count = self.num_ctx_servers + self.num_gen_servers start_time = time.time() hostnames = [] @@ -428,40 +437,40 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): for hostname_file in hostnames: hostname_file_path = os.path.join(hostnames_folder, hostname_file) with open(hostname_file_path, 'r') as f: - actual_hostname = f.read().strip() - print_info(f"Hostname: {actual_hostname} in {hostname_file}") + hostname_port = f.read().strip() + hostname = hostname_port.split(":")[0] + port = hostname_port.split(":")[1] + print_info( + f"Hostname File: {hostname_file_path} Hostname: {hostname_port} Port: {port}" + ) if 
hostname_file.startswith("CTX"): - ctx_hostnames.append(actual_hostname) + ctx_hostnames.append(hostname_port) elif hostname_file.startswith("GEN"): - gen_hostnames.append(actual_hostname) - print_info(f"ctx_hostnames: {ctx_hostnames}") - print_info(f"gen_hostnames: {gen_hostnames}") + gen_hostnames.append(hostname_port) - # Generate server config server_config = { 'hostname': self.hostname, 'port': disagg_server_port, 'backend': 'pytorch', 'context_servers': { 'num_instances': self.num_ctx_servers, - 'urls': [f'{host}:{ctx_gen_port}' for host in ctx_hostnames] + 'urls': ctx_hostnames, }, 'generation_servers': { 'num_instances': self.num_gen_servers, - 'urls': [f'{host}:{ctx_gen_port}' for host in gen_hostnames] + 'urls': gen_hostnames, } } - - config_path = os.path.join(self.output_dir, "server_config.yaml") + config_path = os.path.join(self.output_dir, + f"server_config.{cmd_idx}.yaml") with open(config_path, 'w') as f: yaml.dump(server_config, f) print_info(f"Server config file {config_path} generated") - return config_path - def _get_disagg_server_hostname_and_port(self) -> tuple: - config_path = os.path.join(self.output_dir, "server_config.yaml") - print_info(f"Waiting for server config file: {config_path}") + def _get_disagg_server_hostname_and_port(self, cmd_idx: int) -> tuple: + config_path = os.path.join(self.output_dir, + f"server_config.{cmd_idx}.yaml") start_time = time.time() while True: if os.path.exists(config_path): @@ -481,15 +490,12 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): with open(config_path, 'r') as f: server_config = yaml.safe_load(f) disagg_server_hostname = server_config['hostname'] - disagg_server_port = str(server_config['port']) + disagg_server_port = server_config['port'] return disagg_server_hostname, disagg_server_port def wait_for_benchmark_ready(self, benchmark_status_file: str, timeout: int = 7200): - print_info( - f"Server {self.disagg_serving_type} waiting for benchmark status file: {benchmark_status_file}" - ) 
start_time = time.time() while True: if os.path.exists(benchmark_status_file): @@ -536,26 +542,26 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): server_proc = None benchmark_status_file = os.path.join(self.output_dir, f"benchmark_status.{cmd_idx}.txt") + port = get_free_port() if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type: + self._generate_hostname_file(cmd_idx, port) server_file_path = os.path.join( self.output_dir, f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") is_ctx = "CTX" in self.disagg_serving_type server_cmd = self.ctx_server_cmds[ cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx] - server_envs = copy.deepcopy(os.environ) - # server_envs.update(self.ctx_server_envs[cmd_idx] - # if is_ctx else self.gen_server_envs[cmd_idx]) + server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port) try: print_info( - f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd} envs are {server_envs}" + f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}" ) with open(server_file_path, 'w') as server_ctx: server_proc = subprocess.Popen( server_cmd, stdout=server_ctx, stderr=subprocess.STDOUT, - env=server_envs, + env=copy.deepcopy(os.environ), ) self.wait_for_benchmark_ready(benchmark_status_file, timeout=self.timeout) @@ -568,20 +574,17 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): self.output_dir, f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") disagg_server_cmd = self.disagg_server_cmds[cmd_idx] - disagg_server_envs = copy.deepcopy(os.environ) - # disagg_server_envs.update(self.disagg_server_envs[cmd_idx]) try: - # Generate disagg server config (this will wait for all hostnames) - self._generate_disagg_server_config(cmd_idx) + self._generate_disagg_server_config(cmd_idx, port) print_info( - f"Starting disagg server. 
disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd} envs are {disagg_server_envs}" + f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd}" ) with open(disagg_server_file_path, 'w') as disagg_server_ctx: disagg_server_proc = subprocess.Popen( disagg_server_cmd, stdout=disagg_server_ctx, stderr=subprocess.STDOUT, - env=disagg_server_envs, + env=copy.deepcopy(os.environ), ) self.wait_for_benchmark_ready(benchmark_status_file, timeout=self.timeout) @@ -593,26 +596,21 @@ class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): benchmark_file_path = os.path.join( self.output_dir, f"trtllm-benchmark.{cmd_idx}.log") try: - # Get disagg server's hostname and port disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port( - ) - # Add hostname and port to benchmark command - benchmark_cmd = self.benchmark_cmds[cmd_idx] + [ - '--host', disagg_server_hostname, '--port', - disagg_server_port - ] - benchmark_envs = copy.deepcopy(os.environ) - # benchmark_envs.update(self.benchmark_envs[cmd_idx]) + cmd_idx) + benchmark_cmd = add_host_port_to_cmd( + self.benchmark_cmds[cmd_idx], disagg_server_hostname, + disagg_server_port) self.wait_for_endpoint_ready( f"http://{disagg_server_hostname}:{disagg_server_port}/health", timeout=self.timeout, ) - # Run benchmark print_info( - f"Starting benchmark. disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd} envs are {benchmark_envs}" + f"Starting benchmark. 
disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd}" ) output = subprocess.check_output( - benchmark_cmd, env=benchmark_envs, + benchmark_cmd, + env=copy.deepcopy(os.environ), stderr=subprocess.STDOUT).decode() with open(benchmark_file_path, 'w') as benchmark_ctx: benchmark_ctx.write(output) @@ -702,6 +700,34 @@ class AbstractPerfScriptTestClass(abc.ABC): """ return self._error + def _check_benchmark_output_for_errors(self, output: str) -> None: + """ + Check whether the benchmark output contains error messages (e.g., failed requests). + """ + if not output: + return + + # Check for non-zero failed requests + failed_requests_match = re.search(r'Failed requests:\s+(\d+)', output) + if failed_requests_match: + failed_count = int(failed_requests_match.group(1)) + if failed_count > 0: + self._result_state = "failed" + self._error = Exception( + f"Benchmark has {failed_count} failed requests") + print_error( + f"Benchmark output contains {failed_count} failed requests. Marking test as failed." + ) + return + + # Check for explicit failure markers + if "!FAILED REQUESTS!" in output or "!CHECK LOG FOR ERRORS!" in output: + self._result_state = "failed" + self._error = Exception("Benchmark output contains failure markers") + print_error( + "Benchmark output contains failure markers. Marking test as failed." + ) + def run_ex(self, full_test_name: str, metric_type: PerfMetricType, @@ -763,6 +789,10 @@ class AbstractPerfScriptTestClass(abc.ABC): # if not is_prepare_dataset_cmd: print(collect_and_clean_myelin_time(output)) + # Check whether output has error message + if not is_prepare_dataset_cmd and is_perf_sanity_test: + self._check_benchmark_output_for_errors(output) + # Print the output log to stdout and cache it. 
if is_prepare_dataset_cmd: # For prepare_dataset commands, only print the prepare command info diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index 23f4b20f97..fcbe711760 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -17,6 +17,3 @@ l0_gb200_multi_gpus_perf_sanity: - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_8k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_8k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_8k1k] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 3822ae6d53..10795f404a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -330,8 +330,6 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136) -perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200] SKIP (https://nvbugs/5643646) -perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b300] SKIP 
(https://nvbugs/5643646) unittest/bindings/test_hostfunc.py::test_hostfunc SKIP (https://nvbugs/5643631) examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) @@ -427,5 +425,4 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_mode disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_streaming SKIP (https://nvbugs/5720482) unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) -perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] SKIP (https://nvbugs/5727481) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml index 17679d4ac8..3074bef6c1 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b200.yaml @@ -31,7 +31,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp8_tep8_mtp3_1k1k" @@ -62,7 +62,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp8_tp8_mtp3_1k1k" @@ -93,7 +93,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_dep4_mtp1_1k1k" @@ -128,7 +128,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + 
random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep4_mtp3_1k1k" @@ -159,7 +159,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tp4_mtp3_1k1k" @@ -190,7 +190,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "gpt_oss_fp4_dep2_1k1k" @@ -222,7 +222,7 @@ server_configs: iterations: 5 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "gpt_oss_fp4_dep4_1k1k" @@ -254,7 +254,7 @@ server_configs: iterations: 5 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "gpt_oss_fp4_tp4_eagle3_1k1k" @@ -289,5 +289,5 @@ server_configs: iterations: 32 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b300.yaml b/tests/scripts/perf-sanity/l0_dgx_b300.yaml index b19ca77812..0306ad25a8 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b300.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b300.yaml @@ -31,7 +31,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp8_tep8_mtp3_1k1k" @@ -62,7 +62,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp8_tp8_mtp3_1k1k" @@ -93,7 +93,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_dep4_mtp1_1k1k" @@ -128,7 +128,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep4_mtp3_1k1k" @@ -159,7 +159,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: 
"r1_fp4_v2_tp4_mtp3_1k1k" @@ -190,5 +190,5 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml index 8e8efc1bc3..ab14148b20 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml +++ b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml @@ -32,7 +32,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep4_mtp3_1k1k" @@ -63,7 +63,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tp4_mtp3_1k1k" @@ -94,7 +94,7 @@ server_configs: iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" # 8k1k configs @@ -105,7 +105,7 @@ server_configs: moe_expert_parallel_size: 4 pipeline_parallel_size: 1 max_batch_size: 512 - max_num_tokens: 10304 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: true attention_dp_config: @@ -130,7 +130,7 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep4_mtp3_8k1k" @@ -140,7 +140,7 @@ server_configs: moe_expert_parallel_size: 4 pipeline_parallel_size: 1 max_batch_size: 32 - max_num_tokens: 10304 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: false moe_config: @@ -161,7 +161,7 @@ server_configs: iterations: 10 isl: 8192 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tp4_mtp3_8k1k" @@ -171,7 +171,7 @@ server_configs: moe_expert_parallel_size: 1 pipeline_parallel_size: 1 max_batch_size: 4 - max_num_tokens: 10304 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: false moe_config: @@ -192,7 +192,7 @@ server_configs: iterations: 
10 isl: 8192 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" # 1k8k configs @@ -203,7 +203,7 @@ server_configs: moe_expert_parallel_size: 4 pipeline_parallel_size: 1 max_batch_size: 512 - max_num_tokens: 8192 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: true attention_dp_config: @@ -228,7 +228,7 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep4_mtp3_1k8k" @@ -238,7 +238,7 @@ server_configs: moe_expert_parallel_size: 4 pipeline_parallel_size: 1 max_batch_size: 32 - max_num_tokens: 8192 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: false moe_config: @@ -259,7 +259,7 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tp4_mtp3_1k8k" @@ -269,7 +269,7 @@ server_configs: moe_expert_parallel_size: 1 pipeline_parallel_size: 1 max_batch_size: 4 - max_num_tokens: 8192 + max_num_tokens: 12288 attn_backend: "TRTLLM" enable_attention_dp: false moe_config: @@ -290,5 +290,5 @@ server_configs: iterations: 10 isl: 1024 osl: 8192 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml index 3dcdc83684..432c6ee145 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml +++ b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml @@ -13,7 +13,7 @@ server_configs: moe_expert_parallel_size: 8 pipeline_parallel_size: 1 max_batch_size: 512 - max_num_tokens: 2112 + max_num_tokens: 3136 attn_backend: "TRTLLM" enable_attention_dp: true attention_dp_config: @@ -35,7 +35,7 @@ server_configs: iterations: 12 isl: 1024 osl: 1024 - random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" - name: "r1_fp4_v2_tep8_mtp3" @@ -67,5 +67,5 @@ server_configs: iterations: 12 isl: 1024 osl: 1024 - 
random_range_ratio: 0.8 + random_range_ratio: 0.2 backend: "openai" From 9b3e5e90ee5043421af27035bdc1be5b9cfa84b7 Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Fri, 12 Dec 2025 17:35:55 +0800 Subject: [PATCH 097/172] [None][test] fix a typo in model name in script (#9867) Signed-off-by: Ruodi Lu Co-authored-by: Ruodi Lu --- tests/integration/defs/perf/test_perf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index f2b3cc68df..7041861f91 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -113,7 +113,8 @@ MODEL_PATH_DICT = { "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", "starcoder2_3b": "starcoder2-3b", - "starcoder_15b": "starcoder2-15b", + "starcoder2_7b": "starcoder2-7b", + "starcoder2_15b": "starcoder2-15b", "t5": "t5-small", # not supported for trtllm-bench build config "flan_t5_base": "flan-t5-small", # not supported for trtllm-bench build config From a6263a127fd24246f76cbceed1732a14d0ca0a42 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Fri, 12 Dec 2025 18:53:54 +0800 Subject: [PATCH 098/172] [None][chore] Degrade log level in cublas fp4 runner when using default configs (#9951) Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp index 1e8436df28..02eae46d74 100644 --- a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp @@ -230,7 +230,7 @@ public: else { // Fall back to default (no algorithm specified) - TLLM_LOG_WARNING( + TLLM_LOG_DEBUG( "CublasLtFP4GemmRunner: No valid 
algorithm found (tactic=%ld, available=%zu), falling back to default " "for shape (m=%d, n=%d, k=%d)", tactic, cache.heuristics.size(), m, n, k); From e767fc649a0df77873ef39722f7d528dc8d32179 Mon Sep 17 00:00:00 2001 From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com> Date: Fri, 12 Dec 2025 07:14:14 -0500 Subject: [PATCH 099/172] [None][feat] AutoDeploy: prepare_metadata revisited (#9764) Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com> --- examples/auto_deploy/nano_v3.yaml | 3 +- .../compile/backends/torch_cudagraph.py | 146 +--- .../_torch/auto_deploy/compile/compiler.py | 15 +- .../_torch/auto_deploy/config/default.yaml | 2 - .../custom_ops/attention_interface.py | 730 +++++++++++++----- .../custom_ops/fla/fla_backend_delta.py | 146 +--- .../custom_ops/flashinfer_attention.py | 128 ++- .../mamba/cuda_backend_causal_conv.py | 160 +--- .../mamba/torch_backend_causal_conv.py | 94 +-- .../custom_ops/mamba/torch_backend_mamba.py | 102 +-- .../custom_ops/mamba/triton_backend_mamba.py | 209 ++--- .../_torch/auto_deploy/custom_ops/mla.py | 68 +- .../custom_ops/torch_backend_attention.py | 71 +- .../custom_ops/triton_attention.py | 88 +-- .../auto_deploy/custom_ops/triton_utils.py | 86 +++ tensorrt_llm/_torch/auto_deploy/llm_args.py | 19 + .../_torch/auto_deploy/models/factory.py | 5 - tensorrt_llm/_torch/auto_deploy/models/hf.py | 7 - .../auto_deploy/models/patches/bamba.py | 23 +- .../_torch/auto_deploy/shim/ad_executor.py | 216 +++++- .../_torch/auto_deploy/shim/demollm.py | 14 +- .../transform/library/compile_model.py | 24 +- .../auto_deploy/transform/library/kvcache.py | 145 ++-- .../transform/library/kvcache_transformers.py | 43 +- .../_torch/auto_deploy/utils/node_utils.py | 8 +- .../_utils_test/_graph_test_helpers.py | 44 +- .../_utils_test/torch_attention_reference.py | 41 +- .../multigpu/test_ad_build_small_multi.py | 4 +- .../singlegpu/compile/test_captured_graph.py | 15 +- .../unit/singlegpu/compile/test_compiler.py 
| 21 +- .../compile/test_cuda_graph_batch_sizes.py | 107 +-- .../singlegpu/custom_ops/test_attention_op.py | 13 +- .../test_cuda_causal_conv_cached_op.py | 34 +- .../test_flashinfer_attention_op.py | 57 +- .../custom_ops/test_torch_attention_op.py | 37 +- .../test_torch_causal_conv_cached_op.py | 45 +- .../custom_ops/test_torch_mamba_cached_op.py | 45 +- .../custom_ops/test_triton_mamba_cached_op.py | 20 +- .../triton_kernels/test_triton_utils.py | 162 ++++ .../singlegpu/test_ad_build_small_single.py | 6 +- .../transformations/library/test_kv_cache.py | 75 +- 41 files changed, 1864 insertions(+), 1414 deletions(-) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py create mode 100644 tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py diff --git a/examples/auto_deploy/nano_v3.yaml b/examples/auto_deploy/nano_v3.yaml index be4cc4556c..178fde2e9f 100644 --- a/examples/auto_deploy/nano_v3.yaml +++ b/examples/auto_deploy/nano_v3.yaml @@ -6,7 +6,8 @@ enable_chunked_prefill: true attn_backend: flashinfer model_factory: AutoModelForCausalLM skip_loading_weights: false -free_mem_ratio: 0.9 +# TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9884 +free_mem_ratio: 0.88 cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384] kv_cache_config: # disable kv_cache reuse since not supported for hybrid/ssm models diff --git a/tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py b/tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py index 4a98593c68..87434c48e9 100644 --- a/tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py +++ b/tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py @@ -1,6 +1,6 @@ """Compile backend with cudagraph.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch import torch.nn as nn @@ -12,7 +12,7 @@ from 
tensorrt_llm._torch.autotuner import autotune from ...utils.cuda_graph import CudaGraphWarmUpPhase from ...utils.logger import ad_logger -from ..compiler import CompileBackendRegistry, CompilerBackend +from ..compiler import CompileBackendRegistry, CompilerBackend, GetArgsKwargsForBatchSize def _args_kwargs_flatten_spec(in_spec: TreeSpec, *args, **kwargs) -> List[Any]: @@ -31,13 +31,10 @@ class CapturedGraph(nn.Module): def __init__( self, model: nn.Module, - cuda_graph_batch_sizes: List[int], - num_batched_inputs: int, # number of batched, dynamic inputs... + num_batched_inputs: Optional[int] = None, # number of batched, dynamic inputs... ): super().__init__() self.model = model - self.cuda_graph_max_batch_size = max(cuda_graph_batch_sizes) - ad_logger.info(f"Setting {self.cuda_graph_max_batch_size=}") self.num_batched_inputs = num_batched_inputs if num_batched_inputs is not None else 1 self.cudagraphs: Dict[Tuple[int, ...], CUDAGraph] = {} self._input_buffers: List[torch.Tensor] = [ @@ -45,7 +42,6 @@ class CapturedGraph(nn.Module): ] self._out_buffer_flat: List[torch.Tensor] = None self._args_hash: Optional[Tuple[int, ...]] = None - self.cuda_graph_batch_sizes = sorted(cuda_graph_batch_sizes, reverse=True) self._cuda_graph_mem_pool = None # store the in_spec and out_spec during graph capture @@ -55,17 +51,6 @@ class CapturedGraph(nn.Module): def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]: return tuple(hash(a) for a in flat_args) - @staticmethod - def round_up_to_closest(batch_sizes: Iterable[int], bs: int) -> Optional[int]: - """Return closest batch size larger or equal to bs.""" - if bs > max(batch_sizes, default=0): - return None - return min(batch_sizes, key=lambda x: (x < bs, abs(x - bs)), default=None) - - def round_to_cuda_batch_size(self, bs: int) -> int: - """Round batch size to the nearest cuda batch size.""" - return self.round_up_to_closest(self.cuda_graph_batch_sizes, bs) - def _capture_one_graph(self, *args, **kwargs) -> 
torch.cuda.CUDAGraph: """Capture and return one cuda graph.""" # warm-up and invoke autotuner @@ -87,11 +72,16 @@ class CapturedGraph(nn.Module): self._cuda_graph_mem_pool = self._cuda_graph_mem_pool or graph.pool() return graph - def capture_graph(self, *args, **kwargs): - """Capture and pre-fetch the graph for variable batch size.""" - # check this is the first time we capture the graph + def capture_graph(self, get_args_kwargs: GetArgsKwargsForBatchSize, batch_sizes: List[int]): + """Capture and pre-fetch the graph for desired batch sizes.""" assert not self.cudagraphs, "Graphs already captured." + # sort batch sizes in descending order + batch_sizes = sorted(batch_sizes, reverse=True) + + # get args, kwargs for the first time for the largest batch size + args, kwargs = get_args_kwargs(batch_sizes[0]) + # flatten args, kwargs for the first time and record in_spec all_args_flat, self._in_spec = _args_kwargs_flatten(*args, **kwargs) @@ -102,23 +92,8 @@ class CapturedGraph(nn.Module): # set the args hash --> this is used to compare the static inputs during graph replay self._args_hash = self._get_hash(args_static) - # sanity checks on the batched inputs - msg_bs = ( - f"Input batch size exceeds maximum CUDA graph batch size. " - f"CUDA graph max batch size: {self.cuda_graph_max_batch_size}, " - f"but got input batch sizes: {[input.shape[0] for input in args_batched]}. " - f"Did you intentionally set the maximal value of cuda_graph_batch_sizes lower " - f"than the max_batch_size? It will fall back to non-CUDA graph forward pass for " - f"batch sizes exceeding the max_batch_size." 
- ) - if any(self.cuda_graph_max_batch_size < input.shape[0] for input in args_batched): - ad_logger.info(msg_bs) - - # repeat the batched input tensors to the cuda_graph_max_batch_size - self._input_buffers = [ - input[:1].repeat_interleave(self.cuda_graph_max_batch_size, dim=0) - for input in args_batched - ] + # store the input buffers for the largest batch size + self._input_buffers = [a.clone() for a in args_batched] # create new args, kwargs with the input buffers and static args args, kwargs = self._in_spec.unflatten(self._input_buffers + args_static) @@ -126,14 +101,31 @@ class CapturedGraph(nn.Module): # capture output once with cuda_graph_max_batch_size to capture output buffers # store the out_spec at this point with CudaGraphWarmUpPhase(): - ad_logger.info(f"Warm up with {self.cuda_graph_max_batch_size=} before graph capture") + ad_logger.info(f"Warm up with max_batch_size={batch_sizes[0]} before graph capture") out = self.model(*args, **kwargs) self._out_buffer_flat, self._out_spec = tree_flatten(out) # capture graph now for a range of batch sizes - for bs in self.cuda_graph_batch_sizes: + for bs in batch_sizes: ad_logger.info(f"Capturing graph for batch size: {bs}") + # get new args, kwargs for the current batch size + args, kwargs = get_args_kwargs(bs) + all_args_flat = _args_kwargs_flatten_spec(self._in_spec, *args, **kwargs) + args_batched = all_args_flat[: self.num_batched_inputs] + args_static = all_args_flat[self.num_batched_inputs :] + + # assert that static args match the stored hash + assert self._args_hash == self._get_hash(args_static), ( + "Static args mismatch during capture" + ) + + # copy new inputs to input buffers + for i, input_tensor in enumerate(args_batched): + self._input_buffers[i][: input_tensor.shape[0]].copy_( + input_tensor, non_blocking=True + ) + # setup args, kwargs inputs_truncated = [in_buffer[:bs] for in_buffer in self._input_buffers] args, kwargs = self._in_spec.unflatten(inputs_truncated + args_static) @@ -155,12 
+147,8 @@ class CapturedGraph(nn.Module): if self._args_hash != self._get_hash(args_static): return self.model(*args, **kwargs) - # Calculate rounded-up shapes for each input - rounded_shapes = [ - (self.round_to_cuda_batch_size(input.shape[0]),) + tuple(input.shape[1:]) - for input in args_batched - ] - combined_shape = sum(rounded_shapes, start=()) + # Calculate combined shape tuple as hash for cudagraph lookup + combined_shape = sum((arg.shape for arg in args_batched), start=()) # regular forward for non-matching shapes if combined_shape not in self.cudagraphs: @@ -188,72 +176,22 @@ class TorchCudagraphCompiler(CompilerBackend): *args_for_init, cuda_graph_batch_sizes: Optional[List[int]] = None, num_batched_inputs: int = 1, - max_batch_size: Optional[int] = None, + get_args_kwargs_for_compile: GetArgsKwargsForBatchSize = None, **kwargs_for_init, ): super().__init__(*args_for_init, **kwargs_for_init) - - # heuristic to identify max batch size - assert max_batch_size or cuda_graph_batch_sizes, ( - "At least one of max_batch_size or cuda_graph_batch_sizes must be provided." 
- ) - self.max_batch_size = max_batch_size or max(cuda_graph_batch_sizes) - self.num_batched_inputs = num_batched_inputs - if not cuda_graph_batch_sizes: - # Use heuristic which includes commonly-used sizes like 1 and max_bs - self.cuda_graph_batch_sizes = self._get_graph_batch_sizes(self.max_batch_size) - ad_logger.info(f"Using heuristic cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}") - else: - # Sanitize user-provided sizes: clamp to [1, max_batch_size], dedupe, sort desc - # No point capturing CUDA graphs for batch sizes larger than max_batch_size - effective = { - min(max(1, int(b)), int(self.max_batch_size)) - for b in cuda_graph_batch_sizes - if isinstance(b, (int, float)) and b > 0 - } - self.cuda_graph_batch_sizes = sorted(effective, reverse=True) - - # Log if we clamped any values - original_values = [ - int(b) for b in cuda_graph_batch_sizes if isinstance(b, (int, float)) and b > 0 - ] - clamped_values = [v for v in original_values if v > self.max_batch_size] - if clamped_values: - ad_logger.info( - f"Clamped CUDA graph batch sizes {clamped_values} to max_batch_size={self.max_batch_size}" - ) - - ad_logger.info( - f"Using explicit cuda_graph_batch_sizes: requested={cuda_graph_batch_sizes}" - f" -> effective={self.cuda_graph_batch_sizes}" - f" (clamped to [1, {self.max_batch_size}])" - ) + self.cuda_graph_batch_sizes = cuda_graph_batch_sizes or [] + self.get_args_kwargs_for_compile = get_args_kwargs_for_compile @torch.inference_mode() def compile(self) -> CapturedGraph: - captured_model = CapturedGraph( - self.model, - cuda_graph_batch_sizes=self.cuda_graph_batch_sizes, - num_batched_inputs=self.num_batched_inputs, - ) + captured_model = CapturedGraph(self.model, num_batched_inputs=self.num_batched_inputs) # try capturing cudagraph - if self.args is not None or self.kwargs is not None: - captured_model.capture_graph(*self.args, **self.kwargs) + assert self.get_args_kwargs_for_compile is not None, ( + "get_args_kwargs_for_compile must be provided" + 
) + captured_model.capture_graph(self.get_args_kwargs_for_compile, self.cuda_graph_batch_sizes) return captured_model - - @staticmethod - def _get_graph_batch_sizes( - max_bs: int, extra: Optional[List[int]] = None, multiplier: int = 128 - ) -> List[int]: - """Heuristic to set batch sizes for graph capture.""" - # do 1, max_bs, and extra as special batch sizes - batch_sizes = {1, max_bs, *(extra or [])} - - # add all multiples of multiplier up to max_bs - batch_sizes.update(range(multiplier, max_bs + 1, multiplier)) - - # return as sorted list - return sorted(batch_sizes, reverse=True) diff --git a/tensorrt_llm/_torch/auto_deploy/compile/compiler.py b/tensorrt_llm/_torch/auto_deploy/compile/compiler.py index fcd83828fe..621c0ab4d8 100644 --- a/tensorrt_llm/_torch/auto_deploy/compile/compiler.py +++ b/tensorrt_llm/_torch/auto_deploy/compile/compiler.py @@ -4,10 +4,13 @@ This is useful as final optimization step for in-framework deployment of our inf """ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Tuple, Type import torch.nn as nn +ArgsKwargs = Tuple[List[Any], Dict[str, Any]] +GetArgsKwargsForBatchSize = Callable[[int], ArgsKwargs] + class CompileBackendRegistry: _backend_registry: Dict[str, Type["CompilerBackend"]] = {} @@ -32,16 +35,8 @@ class CompileBackendRegistry: class CompilerBackend(ABC): - def __init__( - self, - model: nn.Module, - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, - **compiler_kwargs, - ): + def __init__(self, model: nn.Module, **compiler_kwargs): self.model = model - self.args = args - self.kwargs = kwargs or {} @abstractmethod def compile(self) -> nn.Module: diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml index 93a8a540cf..4200be6c6a 100644 --- a/tensorrt_llm/_torch/auto_deploy/config/default.yaml +++ b/tensorrt_llm/_torch/auto_deploy/config/default.yaml @@ 
-143,8 +143,6 @@ transforms: ############################################################################################ # SWITCH TO CACHED+FLATTENED ATTENTION + INITIALIZE CACHES ############################################################################################ - update_in_out_nodes: - stage: cache_init insert_cached_attention: stage: cache_init backend: flashinfer diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py index 6bfd23b28d..8e7b7f481a 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py @@ -24,6 +24,259 @@ from ..utils.logger import ad_logger Constant = Union[int, float, str, None] +class InputBuffer: + """Manages contiguous memory buffers for efficient host-to-device transfers. + + This class consolidates multiple tensors into a single contiguous buffer on both + host (pinned memory) and device. This enables efficient bulk transfers with a + single async H2D copy instead of multiple small copies. + + The buffer layout places the truncatable tensor (typically cache_loc) last, + allowing partial copies when the full buffer isn't needed. + + Usage: + 1. Create InputBuffer with tensor specifications (name, max_numel, dtype) + 2. Use store() to write data to the pinned host buffer + 3. Call copy_to_device() to perform a single async H2D transfer + 4. Access device tensors via get_view() + """ + + def __init__(self, tensor_specs: List[Tuple[str, int, torch.dtype]]): + """Initialize the InputBuffer. + + Args: + tensor_specs: Ordered list of (name, max_numel, dtype) tuples. + The last tensor is treated as truncatable during copy. 
+ """ + self._tensor_specs = {name: (numel, dtype) for name, numel, dtype in tensor_specs} + self._tensor_order = [name for name, _, _ in tensor_specs] + + # Calculate offsets for each tensor (aligned to dtype's element size) + self._offsets: Dict[str, int] = {} + self._byte_sizes: Dict[str, int] = {} + + current_offset = 0 + for name, numel, dtype in tensor_specs: + # Align to the tensor's element size for proper memory access + alignment = dtype.itemsize + aligned_offset = (current_offset + alignment - 1) // alignment * alignment + byte_size = numel * dtype.itemsize + self._offsets[name] = aligned_offset + self._byte_sizes[name] = byte_size + current_offset = aligned_offset + byte_size + + # Total buffer size + self._total_bytes = current_offset + + # Allocate contiguous buffers (device buffer starts on default device, use to() to move) + self._device_buffer = torch.empty(self._total_bytes, dtype=torch.uint8) + self._host_buffer = torch.empty( + self._total_bytes, dtype=torch.uint8, device="cpu", pin_memory=True + ) + + # Create persistent views into device and host buffers + # Persistent views help us identify the arguments as static during graph capture. 
+ self._device_views = self._create_views(self._device_buffer) + self._host_views = self._create_views(self._host_buffer) + + # Track current lengths for each tensor (for truncation optimization) + self._current_lengths: Dict[str, int] = {name: 0 for name in self._tensor_order} + + def _create_views(self, buffer: torch.Tensor) -> Dict[str, torch.Tensor]: + """Create views into the given buffer for each tensor.""" + views = {} + for name in self._tensor_order: + offset = self._offsets[name] + byte_size = self._byte_sizes[name] + _, dtype = self._tensor_specs[name] + views[name] = buffer[offset : offset + byte_size].view(dtype) + return views + + @property + def tensor_names(self) -> List[str]: + """Return the list of tensor names in buffer order.""" + return self._tensor_order.copy() + + @property + def _truncatable_name(self) -> str: + """Return the name of the truncatable tensor.""" + return self._tensor_order[-1] + + @property + def total_bytes(self) -> int: + """Total size of the buffer in bytes.""" + return self._total_bytes + + @property + def device(self) -> torch.device: + """Return the device of the device buffer.""" + return self._device_buffer.device + + def get_view(self, name: str) -> torch.Tensor: + """Get the device tensor view for the specified name. + + Args: + name: Name of the tensor. + + Returns: + A view into the device buffer for the specified tensor. + """ + return self._device_views[name] + + def get_view_at_current_length(self, name: str) -> torch.Tensor: + """Get the device tensor view for the specified name at the current length. + + Args: + name: Name of the tensor. + + Returns: + A view into the device buffer for the specified tensor at the current length. + """ + return self._device_views[name][: self._current_lengths[name]] + + def get_host_view(self, name: str) -> torch.Tensor: + """Get the host tensor view for the specified name. + + Args: + name: Name of the tensor. 
+ + Returns: + A view into the pinned host buffer for the specified tensor. + """ + return self._host_views[name] + + def get_capacity(self, name: str) -> int: + """Get the maximum number of elements for the specified tensor. + + Args: + name: Name of the tensor. + + Returns: + Maximum number of elements that can be stored. + """ + numel, _ = self._tensor_specs[name] + return numel + + def get_current_length(self, name: str) -> int: + """Get the current stored length for the specified tensor. + + Args: + name: Name of the tensor. + + Returns: + Number of elements currently stored in the tensor. + """ + return self._current_lengths[name] + + def store( + self, + name: str, + data: List[Number], + fill_value: Optional[Number] = None, + ) -> int: + """Store data into the host buffer. + + Args: + name: Name of the tensor to store to. + data: List of values to store. + fill_value: Optional value to fill the entire tensor with before storing. + If None, only the provided data is written. + + Returns: + Number of elements stored. + """ + numel, dtype = self._tensor_specs[name] + host_view = self.get_host_view(name) + + # Fill with default value if specified + if fill_value is not None: + host_view.fill_(fill_value) + + # Convert list to tensor and copy to host buffer + length = len(data) + assert length <= numel, f"Data too large for buffer '{name}': {length} > {numel}" + + temp_tensor = torch.tensor(data, dtype=dtype) + host_view[:length].copy_(temp_tensor) + + self._current_lengths[name] = length + return length + + def copy_to_device(self) -> None: + """Copy from host buffer to device buffer. + + Uses the current length of the truncatable tensor (last in spec) to minimize + transfer size. All tensors before the truncatable one are fully copied. 
+ """ + # Calculate bytes to copy based on truncatable tensor's current length + truncatable_len = self._current_lengths[self._truncatable_name] + truncatable_offset = self._offsets[self._truncatable_name] + truncatable_dtype = self._tensor_specs[self._truncatable_name][1] + copy_bytes = truncatable_offset + truncatable_len * truncatable_dtype.itemsize + + # Single async copy + with nvtx_range("ad_input_buffer_h2d_copy"): + self._device_buffer[:copy_bytes].copy_( + self._host_buffer[:copy_bytes], non_blocking=True + ) + + def resize(self, name: str, new_capacity: int) -> None: + """Resize a tensor's capacity. + + This operation is only supported for the last tensor in the buffer to avoid + complex offset recalculations. + + Args: + name: Name of the tensor to resize. + new_capacity: New maximum number of elements for the tensor. + """ + assert name == self._truncatable_name, ( + f"Can only resize the last tensor in the buffer ('{self._truncatable_name}'). " + f"Attempted to resize '{name}'." 
+ ) + + old_numel, dtype = self._tensor_specs[name] + if new_capacity <= old_numel: + return # No need to resize if new capacity is smaller or equal + + # Update tensor specs + self._tensor_specs[name] = (new_capacity, dtype) + + # Calculate new byte size for this tensor + new_byte_size = new_capacity * dtype.itemsize + self._byte_sizes[name] = new_byte_size + + # Update total bytes (offset stays the same since it's the last tensor) + self._total_bytes = self._offsets[name] + new_byte_size + + # Resize device buffer in-place + self._device_buffer.resize_(self._total_bytes) + + # Host buffer must be re-allocated to ensure we have pinned memory + old_host_buffer = self._host_buffer + self._host_buffer = torch.empty( + self._total_bytes, dtype=torch.uint8, device="cpu", pin_memory=True + ) + self._host_buffer[: old_host_buffer.numel()].copy_(old_host_buffer) + del old_host_buffer + + # Recreate views after the update + self._device_views = self._create_views(self._device_buffer) + self._host_views = self._create_views(self._host_buffer) + + def to(self, *args, **kwargs) -> None: + """Move the device buffer to a new device/dtype. + + Note: This recreates the device views after moving. + """ + old_device = self._device_buffer.device + self._device_buffer = self._device_buffer.to(*args, **kwargs) + + # Recreate views if device changed + if old_device != self._device_buffer.device: + self._device_views = self._create_views(self._device_buffer) + + class CacheConfig(BaseModel): """Cache configuration for attention-related dtypes.""" @@ -83,21 +336,41 @@ class SequenceInfo: Those are extra arguments that can be provided to the interface and they are stored as follows: - _extra_args: dictionary of extra arguments with currently active values. 
- ### CACHE ARGUMENTS NEEDED FOR ATTENTION OPERATORS FOR FLATTENED SEQUENCES + CACHES ############ + ### AVAILABLE ARGUMENTS TO BE ADDED BY THE TRANSFORMS IF NEEDED ################################ - seq_len: [s_0, s_1, ..., s_{b-1}] such that s_total = sum(s_i) Describes how long each sequence is. For example, input_ids[:s_0] will correspond to sequence 0 in the batch and input_ids[s_0:s_1] will correspond to sequence 1 in the batch. + - cu_seqlen: [0, s_0, s_0+s_1, ..., s_total] + Cumulative sequence lengths of shape [b+1]. cu_seqlen[i+1] - cu_seqlen[i] gives the length + of sequence i. - input_pos: [pos_0, ..., pos_{b-1}] - Corresponds to the total number of tokens that has been already been cached for each sequence - in the batch. - - cache_loc: [c0, ...., c_{np-1}] where np is total number of pages allocated to describe all - sequences in the batch. + Corresponds to the total number of tokens that have already been cached for each sequence + in the batch (i.e., the starting position in the cache for new tokens). - pages_per_seq: [ps_0, ps_1, ..., ps_{b-1}] where ps_i is the number of pages allocated for - sequence i. Note that, for example, cache_loc[p_0:p_1] will correspond to the pages associated - with sequence 1 in the batch. - - slot_idx: [s_0, s_1, ..., s_{b-1}] + sequence i. Note that, for example, cache_loc[sum(ps_0:ps_{i-1}):sum(ps_0:ps_i)] will + correspond to the pages associated with sequence i in the batch. + - cu_num_pages: [0, ps_0, ps_0+ps_1, ..., sum(ps_i)] + Cumulative number of pages of shape [b+1]. cu_num_pages[i+1] - cu_num_pages[i] gives the + number of pages for sequence i. + - seq_len_with_cache: [pos_0+s_0, pos_1+s_1, ..., pos_{b-1}+s_{b-1}] + Total sequence length including cached tokens for each sequence (input_pos + seq_len). + - last_page_len: [lpl_0, lpl_1, ..., lpl_{b-1}] + Number of valid tokens in the last page for each sequence. Computed as + (seq_len_with_cache - 1) % page_size + 1. 
+ - slot_idx: [slot_0, slot_1, ..., slot_{b-1}] Corresponds to the slot index of each sequence in the batch. + - use_initial_states: [bool_0, bool_1, ..., bool_{b-1}] + Per-sequence boolean indicating whether initial states should be used (True if input_pos > 0). + - batch_info: [num_prefill, num_prefill_tokens, num_decode] + Batch metadata containing the number of prefill sequences, total prefill tokens, and number + of decode sequences. + - cache_loc: [c_0, c_1, ..., c_{np-1}] where np is total number of pages allocated to describe + all sequences in the batch. Each value is a page index in the cache. + - _gather_idx: [g_0, g_1, ..., g_{s_total-1}] + Gather indices used by the overlap scheduler to reorder input tokens. + - _mask_scatter_indices: [m_0, m_1, ..., m_{s_total-1}] + Mask scatter indices used by the overlap scheduler to scatter results back. ################################################################################################ @@ -119,7 +392,6 @@ class SequenceInfo: page_size: int = 0, max_num_tokens: Optional[int] = None, vocab_size_padded: Optional[int] = None, - chunk_size: Optional[int] = None, ): """Initialize the SequenceInfo object. @@ -146,15 +418,14 @@ class SequenceInfo: self.max_batch_size = max_batch_size self.page_size = page_size if page_size > 0 else max_seq_len self.vocab_size_padded = vocab_size_padded - self.chunk_size = chunk_size - # Chunk size is an input to a custom op, so we need to set a default value if it is not provided. - if self.chunk_size is None: - self.chunk_size = 128 # NOTE (lucaslie): WAR to address issue when using flashinfer attention with # (max_batch_size, max_seq_len) input in trtllm runtime. 
# see https://github.com/NVIDIA/TensorRT-LLM/issues/4504 max_seq_len_adjusted = self.max_seq_len + 1 + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9883 clean up this hack + self.max_state_slots = max_batch_size + 1 + # if the provided max_num_tokens is less than the max_batch_size * max_seq_len_adjusted, # we use the provided max_num_tokens. If max_num_tokens provided is more, we still use # max_batch_size * max_seq_len_adjusted since the extra tokens cannot be used. @@ -188,31 +459,47 @@ class SequenceInfo: ) # indicator if extra args are activated that are needed for cached attention backends - self._is_cached_attn = False + self._use_flattened_layout = False # TENSOR FIELDS ############################################################################ - self._args_device: Dict[str, torch.Tensor] = { + # Define tensor specifications for the InputBuffer + # Order matters: cache_loc is placed LAST for truncation optimization during H2D copy + # Format: (name, max_numel, dtype) + tensor_specs: List[Tuple[str, int, torch.dtype]] = [ # TENSOR FIELDS FOR UNCACHED ATTENTION - "input_ids": torch.ones(self.max_num_tokens, dtype=torch.int), - "position_ids": torch.zeros(self.max_num_tokens, dtype=torch.long), + ("input_ids", self.max_num_tokens, torch.int), + ("position_ids", self.max_num_tokens, torch.long), # TENSOR FIELDS FOR CACHED ATTENTION - "seq_len": torch.empty(self.max_batch_size, dtype=torch.int), - "input_pos": torch.empty(self.max_batch_size, dtype=torch.int), - "cache_loc": torch.empty(max_num_cache_loc_assignments, dtype=torch.int), - "pages_per_seq": torch.empty(self.max_batch_size, dtype=torch.int), - "slot_idx": torch.empty(self.max_batch_size, dtype=torch.long), + ("seq_len", self.max_batch_size, torch.int), + ("cu_seqlen", self.max_batch_size + 1, torch.int), + ("input_pos", self.max_batch_size, torch.int), + ("pages_per_seq", self.max_batch_size, torch.int), + ("cu_num_pages", self.max_batch_size + 1, torch.int), + ("seq_len_with_cache", 
self.max_batch_size, torch.int), + ("last_page_len", self.max_batch_size, torch.int), + ("slot_idx", self.max_batch_size, torch.long), + ("use_initial_states", self.max_batch_size, torch.bool), + ("batch_info", 3, torch.int), # OTHER FIELDS WHERE WE NEED EFFICIENT HOST<>DEVICE TRANSFER - "_gather_idx": torch.empty(self.max_num_tokens, dtype=torch.int), + ("_gather_idx", self.max_num_tokens, torch.int), + ("_mask_scatter_indices", self.max_num_tokens, torch.int), + # cache_loc is LAST for truncation optimization (it's the largest tensor) + ("cache_loc", max_num_cache_loc_assignments, torch.int), + ] + + # Create the InputBuffer that manages contiguous host and device memory + # Starts on default device; use to() to move to target device + self._input_buffer = InputBuffer(tensor_specs) + + # Initialize args_list from tensor specs + self._args_list: Dict[str, List[int]] = { + name: [0] * numel for name, numel, _ in tensor_specs } - self._args_host: Dict[str, List[int]] = { - k: v.tolist() for k, v in self._args_device.items() - } - # NOTE: order of keys is relevant here! - self._uncached_arg_names = ("input_ids", "position_ids") - self._cached_arg_names = ("seq_len", "input_pos", "cache_loc", "pages_per_seq", "slot_idx") - # page_size is the size of attentionkv-cache pages. - # chunk_size is used in mamba prefill kernels to split the context into chunks. 
- self._cached_constants = ("page_size", "chunk_size") + + self._active_args = ("input_ids", "position_ids") + self._shapeable_args = ("input_ids", "position_ids") + # Args that should be returned from host (pinned memory) instead of device in _named_args + self._host_return_args = ("batch_info",) ############################################################################################ # EXTRA TENSOR FIELDS ###################################################################### @@ -224,7 +511,7 @@ class SequenceInfo: @property def device(self) -> torch.device: - return self._args_device["input_ids"].device + return self._input_buffer.device def _shape_for_forward(self, tnsr: torch.Tensor) -> torch.Tensor: """Shape the tensor for the forward pass based on the current attention mode. @@ -238,7 +525,7 @@ class SequenceInfo: # check if we are still running uncached attention in which case we are also still # operate on unflattened tensors with explicit [batch_size, seq_len, ...] shape # generate-only batches are also formatted like this (i.e. 
[b, 1]) - if not self._is_cached_attn or self.is_generate: + if not self._use_flattened_layout or self.is_generate: bs = len(self.seq_len) sl = self.seq_len[0] # use [1,total_len] shape to indicate non-generate-only batch for cached attention @@ -248,21 +535,27 @@ class SequenceInfo: # truncate to total tokens now, reshape, and return return tnsr[: self.total_num_tokens].view(bs, sl, *tnsr.shape[1:]) - def _named_args( - self, include_extra_args: bool = True, include_cached_args: bool = True - ) -> Dict[str, torch.Tensor]: - # start with uncached args and shape them along the way - args = {k: self._shape_for_forward(self._args_device[k]) for k in self._uncached_arg_names} + def _named_args(self, include_extra_args: bool = True) -> Dict[str, torch.Tensor]: + # Build args dict, using host views for _host_return_args, device views otherwise + args = {} + for name in self._active_args: + if name in self._host_return_args: + view = self._input_buffer.get_host_view(name) + else: + view = self._input_buffer.get_view(name) + args[name] = self._shape_for_forward(view) if name in self._shapeable_args else view # check other args to include if include_extra_args: args.update(self._extra_args) - if include_cached_args: - args.update({k: self._args_device[k] for k in self._cached_arg_names}) - return args + @property + def available_args(self) -> Set[str]: + """Return a list of available arguments.""" + return set(self._input_buffer.tensor_names) + @property def named_args(self) -> Dict[str, torch.Tensor]: """Return a dictionary of named arguments. @@ -273,76 +566,28 @@ class SequenceInfo: Cached arguments are only included if the attention mode is cached to reflect that after switching to cached attention, the cached arguments are required for a forward pass. 
""" - return self._named_args(include_extra_args=True, include_cached_args=self._is_cached_attn) - - @property - def named_standard_args(self) -> Dict[str, torch.Tensor]: - """Return a dictionary of named standard arguments. - - We define standard arguments as the arguments that are part of the model's forward function - by default (i.e., without the extra arguments). - - Just liked ``named_args``, this property includes cached attention arguments if the - attention mode is cached. - """ - return self._named_args(include_extra_args=False, include_cached_args=self._is_cached_attn) + return self._named_args(include_extra_args=True) @property def args(self) -> Tuple[torch.Tensor, ...]: """Return a tuple of arguments.""" return tuple(self.named_args.values()) - @property - def args_for_prepare_metadata(self) -> Tuple[str, ...]: - """Return a tuple of node/tensor arguments for the prepare_metadata op. - - The ``prepare_metadata`` interface expects the following arguments: - - 1. ``args_for_prepare_metadata`` as nodes, i.e., as input-dependent tensors. - 2. ``const_args_for_prepare_metadata`` as constants that can directly by passed in as args - to the corresponding ``prepare_metadata`` node/op. - - This interface handles the tensor/node arguments part and can be used by compiler passes - like ``insert_cached_attention`` to extract the constant arguments and add them to the - ``prepare_metadata`` node/op. - """ - # NOTE: for now we do _not_ include input_ids since we are not guaranteed that input_ids - # is part of the graph, e.g., in situations where the graph is a submodule of the overall - # model. In such instances, the graph usually sees inputs_embeds. However, we assume for - # now that position_ids is always part of the graph. - return ("position_ids",) + self._cached_arg_names - - @property - def const_args_for_prepare_metadata(self) -> Tuple[Constant, ...]: - """Return a tuple of extra (const, non-tensor) arguments for the prepare_metadata op. 
- - The ``prepare_metadata`` interface expects the following arguments: - - 1. ``args_for_prepare_metadata`` as nodes, i.e., as input-dependent tensors. - 2. ``const_args_for_prepare_metadata`` as constants that can directly by passed in as args - to the corresponding ``prepare_metadata`` node/op. - - This interface handles the constant arguments part and can be used by compiler passes like - ``insert_cached_attention`` to extract the constant arguments and add them to the - ``prepare_metadata`` node/op. - """ - return tuple(getattr(self, k) for k in self._cached_constants) - @property def seq_len(self) -> List[int]: - return self._args_host["seq_len"].copy() + return self._args_list["seq_len"].copy() @property def input_pos(self) -> List[int]: - return self._args_host["input_pos"].copy() + return self._args_list["input_pos"].copy() @property def cache_loc(self) -> List[int]: - return self._args_host["cache_loc"].copy() + return self._args_list["cache_loc"].copy() @property def pages_per_seq(self) -> List[int]: - return self._args_host["pages_per_seq"].copy() + return self._args_list["pages_per_seq"].copy() @property def num_sequences(self) -> int: @@ -363,9 +608,18 @@ class SequenceInfo: @num_pages.setter def num_pages(self, value): self._num_pages = value - # update the cache_loc tensor - if self._args_device["cache_loc"].numel() < value: - self._args_device["cache_loc"].resize_(value) + # Check if we need to resize cache_loc (it's the last tensor in the buffer) + cache_loc_capacity = self._input_buffer.get_capacity("cache_loc") + if value > cache_loc_capacity: + ad_logger.info( + f"Resizing cache_loc capacity from {cache_loc_capacity} to {value} " + f"to accommodate num_pages={value}" + ) + # Resize the input buffer (cache_loc is the last tensor, so this is supported) + self._input_buffer.resize("cache_loc", value) + # Also resize the args_list to match + old_size = len(self._args_list["cache_loc"]) + self._args_list["cache_loc"].extend([0] * (value - old_size)) 
@property def is_paged(self) -> bool: @@ -420,6 +674,7 @@ class SequenceInfo: pages_per_seq = [len(p) for p in page_assignments] return cache_loc_flat, pages_per_seq + # TODO: remove after updating all cached backends @classmethod def _get_sanitized_seq_len( cls, input_or_position_ids: torch.Tensor, seq_len: torch.Tensor @@ -459,7 +714,7 @@ class SequenceInfo: _, s = input_or_position_ids.shape[:2] num_seq = cls._get_sanitized_num_sequences(input_or_position_ids, seq_len) if s > 1: - return seq_len[:num_seq].detach().clone() + return seq_len[:num_seq].clone() else: return torch.ones(num_seq, dtype=seq_len.dtype, device=seq_len.device) @@ -481,31 +736,29 @@ class SequenceInfo: num_seq = b return num_seq - def switch_to_cached_attn_inputs(self) -> List[str]: - """Switch to inputs for cached+flattened attention operators. + def activate_arg(self, arg_name: str) -> bool: + """Activate a desired argument. + + The first time this function is called we will also switch to the flattened input layout. Returns: - List[str]: List of new argument names that are now activated. - - This function will change the inputs provided by the interface from the arguments expected - by regular attention in PyTorch (SDPA-style) to the arguments needed once we use attention - operators with cache support and flattened sequences. - - NOTE: The graph inference optimizer is responsible for ensuring the the new inputs are - correctly reflected in the graph after this function is called. + True if the argument was activated, False if already activated. 
""" - assert not self._is_cached_attn, "Cached+flattened attention already activated" - self._is_cached_attn = True - return list(self._cached_arg_names) + assert arg_name in self.available_args, f"{arg_name=} not found in {self.available_args}" + self._use_flattened_layout = True + if arg_name not in self._active_args: + self._active_args += (arg_name,) + return True + return False def to(self, *args, **kwargs) -> None: - def _move_dict(d: Dict[str, torch.Tensor]) -> None: - for k, v in d.items(): - if v is not None: - d[k] = v.to(*args, **kwargs) + # Move the InputBuffer (which recreates views automatically) + self._input_buffer.to(*args, **kwargs) - _move_dict(self._args_device) - _move_dict(self._extra_args) + # Move extra args + for k, v in self._extra_args.items(): + if v is not None: + self._extra_args[k] = v.to(*args, **kwargs) def set_example_sequence( self, @@ -525,7 +778,6 @@ class SequenceInfo: for ids_one_seq in input_ids ] cache_loc = list(range(sum(pages_per_seq))) - page_assignments = self._get_page_assignments(cache_loc, pages_per_seq) # vanilla slot indices slot_idx = list(range(len(input_ids))) @@ -534,7 +786,8 @@ class SequenceInfo: input_ids, position_ids, # will be auto-inferred if None input_pos=0, # no cache history - page_assignments=page_assignments, # vanilla page assignments + cache_loc=cache_loc, # vanilla page assignments + pages_per_seq=pages_per_seq, # vanilla page assignments slot_idx=slot_idx, # vanilla slot indices **extra_args, ) @@ -546,9 +799,9 @@ class SequenceInfo: input_ids = torch.ones(self.max_batch_size, seq_len, dtype=torch.int).tolist() self.set_example_sequence(input_ids) - def set_generate_only_batch(self) -> None: + def set_generate_only_batch(self, batch_size: Optional[int] = None) -> None: """Set an example sequence for generate-only batch.""" - self.set_example_sequence([[1]] * self.max_batch_size) + self.set_example_sequence([[1]] * (batch_size or self.max_batch_size)) def reset(self) -> None: """Reset the 
sequence information. @@ -571,33 +824,29 @@ class SequenceInfo: name: str, tnsr_like: List[Number], reset_val: Optional[Number] = None, + force_copy: bool = False, ) -> None: - """Store the argument on the host and copy to the device in a non-blocking fashion. + """Store the argument into the pinned host buffer for later batch transfer to device. + + The data is stored in the host-side pinned memory buffer managed by InputBuffer. + The actual H2D transfer happens in a single batch at the end of nest_sequences(). Args: name: Name of the argument to store. tnsr_like: List of values to store. - reset_val: Value to reset/fill the full tensor on the device to before writing to it. + reset_val: Value to reset/fill the tensor with before writing data. + force_copy: Whether to force immediate copy to device (for use outside nest_sequences). """ - with nvtx_range(f"ad_store_seq_info_arg_{name}"): - tnsr_device = self._args_device[name] + with nvtx_range(f"ad_store_on_host_seq_info_arg_{name}"): + # Always store list object for Python access + self._args_list[name] = tnsr_like.copy() - # store list object on the host - self._args_host[name] = tnsr_like.copy() + # Only store to buffer when the argument is active or force_copy is True + if not (name in self._active_args or force_copy): + return - # pin the memory on the host - tnsr_host = torch.tensor(tnsr_like, dtype=tnsr_device.dtype, pin_memory=True) - - # check for available space - assert tnsr_device.numel() >= tnsr_host.numel(), ( - f"device tensor {name} is too small, available: {tnsr_device.numel()}, " - f"required: {tnsr_host.numel()}" - ) - - # reset/copy to the device in a non-blocking fashion - if reset_val is not None: - tnsr_device.fill_(reset_val) - tnsr_device[: len(tnsr_like)].copy_(tnsr_host, non_blocking=True) + # Store to the InputBuffer's pinned host memory + self._input_buffer.store(name, tnsr_like, fill_value=reset_val) def _store_extra_arg( self, name: str, tnsr_like: Optional[Union[torch.Tensor, 
Sequence[torch.Tensor]]] @@ -627,54 +876,67 @@ class SequenceInfo: self, input_ids: Sequence[Sequence[int]], position_ids: Optional[Sequence[Sequence[int]]] = None, + seq_len: Optional[Sequence[int]] = None, input_pos: Optional[Union[Sequence[int], int]] = None, - page_assignments: Optional[Sequence[Sequence[int]]] = None, + batch_info: Optional[Sequence[int]] = None, + cu_seqlen: Optional[Sequence[int]] = None, + cache_loc: Optional[Sequence[int]] = None, + pages_per_seq: Optional[Sequence[int]] = None, + cu_num_pages: Optional[Sequence[int]] = None, + seq_len_with_cache: Optional[Sequence[int]] = None, + last_page_len: Optional[Sequence[int]] = None, slot_idx: Optional[Sequence[int]] = None, + use_initial_states: Optional[Sequence[bool]] = None, + _gather_idx: Optional[Sequence[int]] = None, + _mask_scatter_indices: Optional[Sequence[int]] = None, **extra_args: Dict[str, Union[torch.Tensor, Sequence[torch.Tensor]]], ) -> None: """Create and store sequence information for the next forward pass. Args: input_ids: List of sequences of input_ids. - position_ids: List of sequences of position_ids for each token. - input_pos: Absolute starting position in the cache for each sequence. - page_assignments: List of sequences of page assignments for each sequence. - slot_idx: List of slot indices for each sequence. + position_ids: List of sequences of position_ids for each token. If None, auto-inferred + from input_pos and seq_len. + seq_len: List of sequence lengths for each sequence. If None, inferred from input_ids. + input_pos: Absolute starting position in the cache for each sequence. Can be a single + int (applied to all sequences) or a list of ints. + batch_info: Batch metadata as [num_prefill, num_prefill_tokens, num_decode]. If None, + auto-computed from seq_len. + cu_seqlen: Cumulative sequence lengths of shape [b+1]. If None, auto-computed from + seq_len. + cache_loc: Flat list of page indices for all sequences. Must be provided together with + pages_per_seq. 
+ pages_per_seq: Number of pages allocated per sequence. Must be provided together with + cache_loc. + cu_num_pages: Cumulative number of pages of shape [b+1]. If None, auto-computed from + pages_per_seq. + seq_len_with_cache: Total sequence length including cached tokens (input_pos + seq_len) + for each sequence. If None, auto-computed. + last_page_len: Number of valid tokens in the last page for each sequence. If None, + auto-computed from seq_len_with_cache. + slot_idx: Slot index for each sequence in the batch. + use_initial_states: Per-sequence boolean indicating if the initial states should be + used. If None, auto-computed as (input_pos > 0). + _gather_idx: Gather indices for the overlap scheduler to reorder input tokens. + _mask_scatter_indices: Mask scatter indices for the overlap scheduler. extra_args: Extra arguments to be stored in the interface. This i/f will ensure that all sequence info args are updated accordingly. Reset values are chosen as "neutral" values so that for cases like rounding up batch sizes for cudagraph we only write to unused buffers/caches. """ - ### UPDATE METADATA ######################################################################## - # update metadata first since it's useful for other updates to have up-to-date information - - # set new sequence lengths --> resetting the remaining entries to zero is important to help - # us discern the actual number of sequences in the batch. - self._store_arg("seq_len", [len(ids) for ids in input_ids], reset_val=0) + ### UPDATE SEQUENCE LENGTH AND INPUT POSITION FIRST SINCE IT'S USED FOR OTHER UPDATES ###### + if seq_len is None: + seq_len = [len(ids) for ids in input_ids] + self._store_arg("seq_len", seq_len) # check for updated input_pos (i.e. 
cache start position) if input_pos is not None: self._store_arg( "input_pos", [input_pos] * self.num_sequences if isinstance(input_pos, int) else input_pos, - reset_val=0, ) - # check for updated page_assignments - if page_assignments is not None: - cache_loc, pages_per_seq = self._get_cache_locations_and_pages_per_sequence( - page_assignments - ) - free_cache_loc = self._get_unique_value(set(cache_loc), self.num_pages) - self._store_arg("cache_loc", cache_loc, reset_val=free_cache_loc) - self._store_arg("pages_per_seq", pages_per_seq, reset_val=1) - - # check for updated slot_idx - if slot_idx is not None: - free_slot_idx = self._get_unique_value(set(slot_idx), self.max_batch_size) - self._store_arg("slot_idx", slot_idx, reset_val=free_slot_idx) - ### UPDATE MAIN INPUTS ##################################################################### # set new input_ids and make sure to flatten it self._store_arg("input_ids", self._flatten(input_ids)) @@ -687,37 +949,98 @@ class SequenceInfo: ] self._store_arg("position_ids", self._flatten(position_ids)) + ### UPDATE OTHER (DERIVATIVE) METADATA ##################################################### + # check for updated batch_info_tensor + if batch_info is None: + num_prefill = sum(s_l > 1 for s_l in seq_len) + num_prefill_tokens = sum(s_l for s_l in seq_len if s_l > 1) + num_decode = len(seq_len) - num_prefill + batch_info = [num_prefill, num_prefill_tokens, num_decode] + self._store_arg("batch_info", batch_info) + + if cu_seqlen is None: + cu_seqlen = torch.zeros(len(seq_len) + 1, dtype=torch.int) + cu_seqlen[1:] = torch.cumsum(torch.tensor(seq_len), dim=0) + cu_seqlen = cu_seqlen.tolist() + self._store_arg("cu_seqlen", cu_seqlen) + + # check for updated page_assignments + assert (cache_loc is None) == (pages_per_seq is None), ( + "cache_loc and pages_per_seq must beeither both None or both set" + ) + if cache_loc is not None and pages_per_seq is not None: + self._store_arg("cache_loc", cache_loc) + 
self._store_arg("pages_per_seq", pages_per_seq) + + # update cumulative number of pages + if cu_num_pages is None: + pages_per_seq = self.pages_per_seq + cu_num_pages = torch.zeros(len(pages_per_seq) + 1, dtype=torch.int) + cu_num_pages[1:] = torch.cumsum(torch.tensor(pages_per_seq), dim=0) + cu_num_pages = cu_num_pages.tolist() + self._store_arg("cu_num_pages", cu_num_pages) + + # update sequence length with cache + if seq_len_with_cache is None: + seq_len_with_cache = [i_p + s_l for i_p, s_l in zip(self.input_pos, self.seq_len)] + self._store_arg("seq_len_with_cache", seq_len_with_cache) + + # update last page length + if last_page_len is None: + last_page_len = [(slwc - 1) % self.page_size + 1 for slwc in seq_len_with_cache] + self._store_arg("last_page_len", last_page_len) + + # check for updated slot_idx + if slot_idx is not None: + self._store_arg("slot_idx", slot_idx) + + # check for updated use_initial_states + if use_initial_states is None: + use_initial_states = [i_p > 0 for i_p in self.input_pos] + self._store_arg("use_initial_states", use_initial_states) + + ### UPDATE OVERLAP SCHEDULER METADATA ###################################################### + # check for updated _gather_idx + if _gather_idx is not None: + self._store_arg("_gather_idx", _gather_idx, force_copy=True) + + # check for updated _mask_scatter_indices + if _mask_scatter_indices is not None: + self._store_arg("_mask_scatter_indices", _mask_scatter_indices, force_copy=True) + ### UPDATE EXTRA INPUTS #################################################################### self._extra_args = {} for key, value in extra_args.items(): self._store_extra_arg(key, value) + ### BATCH COPY TO DEVICE ################################################################### + # Perform a single async H2D copy for all device tensors + # The copy is truncated at the end of cache_loc to minimize transfer size + self._input_buffer.copy_to_device() + @nvtx_range("ad_rescatter_input_ids") - def rescatter_input_ids( 
- self, ungathered_input_ids: torch.Tensor, gather_idx: List[int], scatter_ref: int - ): + def rescatter_input_ids(self, ungathered_input_ids: torch.Tensor): """Re-scatter the provided ungathered input ids into the input_ids tensor. Args: - ungathered_input_ids: The input ids on the device from which to gather. - gather_idx: The list of indices to gather from the ungathered_input_ids. - scatter_ref: The reference index to scatter to in input_ids via masked scatter. + ungathered_input_ids: The input ids on the device from which to gather using the stored + gather and mask scatter indices. Returns: None This function will assume that we are in a generate-only batch. """ - # store the new gather indices - self._store_arg("_gather_idx", gather_idx) + # retrieve input_ids and gather_ids on device + input_ids_device = self._input_buffer.get_view_at_current_length("input_ids") + gather_ids_device = self._input_buffer.get_view_at_current_length("_gather_idx") + mask_scatter_indices_device = self._input_buffer.get_view_at_current_length( + "_mask_scatter_indices" + ) - # gather the provided input ids in a streaming fashion - gather_ids_device = self._args_device["_gather_idx"][: len(gather_idx)] - packed_input_ids = ungathered_input_ids[gather_ids_device] - - # re-scatter the provided input ids into the input_ids tensor - input_ids_device = self._args_device["input_ids"] - input_ids_device.masked_scatter_(input_ids_device == scatter_ref, packed_input_ids) + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input_ids, gather_ids_device, mask_scatter_indices_device, input_ids_device + ) @nvtx_range("ad_unnest_sequences") def unnest_sequences(self, t_nested: torch.Tensor) -> List[torch.Tensor]: @@ -796,7 +1119,8 @@ class AttentionDescriptor(ABC): ``` def attention_op( *qkv, # list of tensors corresponding to Q, K, V as in source attention op - *metadata, # global info about the sequences as returned by the prepare_metadata op + *meta_std, # standard 
metadata fields identified by matching arg names! + *meta_extra,# metadata about the sequences as returned by the prepare_metadata op *caches, # contains layer-specific caches per provided cache initializers *buffers, # global buffers used by the attention op as provided by buffer initializers *constants, # basic arguments (int, float, str, None) added as CONSTANTS in the graph @@ -814,31 +1138,42 @@ class AttentionDescriptor(ABC): @classmethod @abstractmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: + def get_standard_metadata_args(cls) -> List[str]: + """Get the list of standard metadata arguments that are expected by the attention op.""" + raise NotImplementedError + + @classmethod + def get_prepare_extra_metadata_info( + cls, any_source_attn_node: Node + ) -> Tuple[Optional[PrepareMetadataCallable], int, List[Constant]]: """Get the prepare_metadata op. The prepare_metadata op should follow the below signature: ``` - def prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, + def prepare_extra_metadata( + *desired_graph_inputs, # matched by arg names in the signature of the prepare_metadata op + *constant_inputs, # as returned by this function ) -> List[torch.Tensor]: ... ``` - The metadata should contain all necessary global information required for the underlying - attention op to process the input sequence and the returned list of tensors will be passed - on to each invocation of the attention op in the graph. + The metadata should contain all necessary extra global information required for the + underlying attention op to process the input sequence and the returned list of tensors will + be passed as additional arguments to each invocation of the attention op in the graph. - prepare_metadata is called once at the beginning of the forward pass. 
+ This may not be needed for all attention ops if the standard metadata is sufficient. + + prepare_metadata is called once at the beginning of the forward pass for each attention op + detected in the graph. **Note that the prepare_metadata op should be a valid torch custom op, which comes with restrictions on the supported types in the signature.** + + Returns: + - prepare_metadata_op: The prepare_metadata op callable. + - num_meta_out: The number of extra metadata tensors to return. + - const_args: A list of constant arguments to pass to the prepare_metadata op. """ + return None, 0, [] @classmethod @abstractmethod @@ -878,15 +1213,16 @@ class AttentionDescriptor(ABC): If the buffer initializer requires information about the attention op, it can retrieve the necessary information from the source attention node. """ + return {} @classmethod - @abstractmethod def get_constants(cls, source_attn_node: Node) -> List[Constant]: """Provide a list of constant arguments to be passed to the attention op. The constant arguments are passed to the attention op as additional arguments after the caches and buffers. The constants are expected to be of type int, float, str, or None. 
""" + return [] class AttentionRegistry: diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py index b500e3e3dd..5cf4a4149c 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py @@ -5,7 +5,7 @@ Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484 Kernels are based on this repo: https://github.com/fla-org/flash-linear-attention """ -from typing import List, Tuple +from typing import List import torch from torch._ops import OpOverloadPacket @@ -21,82 +21,12 @@ from ..attention_interface import ( CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) from .delta_rule.chunk import chunk_delta_rule_fwd from .delta_rule.fused_recurrent import fused_recurrent_delta_rule_fwd -@torch.library.custom_op("auto_deploy::fla_delta_prepare_metadata", mutates_args=()) -def fla_delta_prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - """Prepare metadata for cached chunked delta rule. - - Returns a tuple of (cu_seq_lens, slot_idx_sanitized, use_initial_states, batch_info_tensor). 
- """ - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - cu_seqlens = torch.zeros(num_seq + 2, dtype=torch.int32, device=seq_len_sanitized.device) - - slot_idx_sanitized = slot_idx[:num_seq].clone().to(torch.long) - use_initial_states = input_pos[:num_seq] > 0 - - _, s = position_ids.shape[:2] - if s > 1: - prefill_mask = seq_len_sanitized > 1 - num_prefill = int(prefill_mask.sum().item()) - num_prefill_tokens = int(seq_len_sanitized[:num_prefill].sum().item()) - num_decode = num_seq - num_prefill - - # compute cu_seq_lens for the prefill sequences first - cu_seqlens[1 : num_prefill + 1] = torch.cumsum(seq_len_sanitized[:num_prefill], 0) - else: - num_prefill = 0 - num_prefill_tokens = 0 - num_decode = num_seq - - # decode is just arange... - cu_seqlens[num_prefill + 1 :] = torch.arange( - num_decode + 1, device=cu_seqlens.device, dtype=cu_seqlens.dtype - ) - batch_info_tensor = torch.tensor( - [num_prefill, num_prefill_tokens, num_decode], dtype=torch.int32 - ) - - return cu_seqlens, slot_idx_sanitized, use_initial_states, batch_info_tensor - - -@fla_delta_prepare_metadata.register_fake -def fla_delta_prepare_metadata_fake( - position_ids, - seq_len, - input_pos, - cache_loc, - pages_per_seq, - slot_idx, - page_size, - chunk_size, -): - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - cu_seq_lens = torch.empty(num_seq + 2, dtype=torch.int32, device=seq_len_sanitized.device) - return ( - cu_seq_lens, - torch.empty(num_seq, dtype=torch.long, device=slot_idx.device), - torch.empty(num_seq, dtype=torch.bool, device=slot_idx.device), - torch.empty(3, dtype=torch.int32), # host tensor - ) - - @torch.library.custom_op("auto_deploy::fla_cached_delta_rule", mutates_args=()) def fla_cached_delta_rule( # INPUTS (dense but may be flattened across sequences) @@ -104,11 +34,13 @@ def fla_cached_delta_rule( k: torch.Tensor, v: torch.Tensor, beta: 
torch.Tensor, - # METADATA - cu_seqlens: torch.Tensor, # [num_seq + 1] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] - batch_info_tensor: torch.Tensor, # [3] + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES delta_cache: torch.Tensor, # [max_batch_size, H, K, V] # CONSTANTS @@ -117,16 +49,22 @@ def fla_cached_delta_rule( b, s, num_heads, _ = q.shape # flatten it - q_flat = q.view(1, b * s, num_heads, -1) - k_flat = k.view(1, b * s, num_heads, -1) - v_flat = v.view(1, b * s, num_heads, -1) - beta_flat = beta.view(1, b * s, num_heads) + q_flat = q.view(b * s, num_heads, -1) + k_flat = k.view(b * s, num_heads, -1) + v_flat = v.view(b * s, num_heads, -1) + beta_flat = beta.view(b * s, num_heads) # pre-allocate output y = torch.empty_like(v, memory_format=torch.contiguous_format) - y_flat = y.view(1, b * s, num_heads, -1) + y_flat = y.view(b * s, num_heads, -1) - num_prefill, num_prefill_tokens, num_decode = batch_info_tensor.tolist() + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + + # clean up metadata + cu_seqlen_prefill = cu_seqlen[: num_prefill + 1] + slot_idx = slot_idx[:num_seq].to(torch.long) + use_initial_states = use_initial_states[:num_seq] if num_prefill > 0: initial_states = None @@ -138,17 +76,17 @@ def fla_cached_delta_rule( ) y_prefill, _, final_state = chunk_delta_rule_fwd( - q=q_flat[:, :num_prefill_tokens], - k=k_flat[:, :num_prefill_tokens], - v=v_flat[:, :num_prefill_tokens], - beta=beta_flat[:, :num_prefill_tokens], + q=q_flat[None, :num_prefill_tokens], + k=k_flat[None, :num_prefill_tokens], + v=v_flat[None, :num_prefill_tokens], + beta=beta_flat[None, :num_prefill_tokens], scale=scale, initial_state=initial_states, output_final_state=True, - cu_seqlens=cu_seqlens[: num_prefill + 1], + cu_seqlens=cu_seqlen_prefill, ) - y_flat[:, 
:num_prefill_tokens] = y_prefill.to(y_flat.dtype) + y_flat[None, :num_prefill_tokens] = y_prefill.to(y_flat.dtype) delta_cache.index_copy_(0, slot_idx[:num_prefill], final_state.to(delta_cache.dtype)) del y_prefill, initial_states, final_state @@ -157,17 +95,16 @@ def fla_cached_delta_rule( # NOTE: avoiding state clone here and adopting the kernel to handle # indexed initial states would give a boost y_decode, _, final_state = fused_recurrent_delta_rule_fwd( - q=q_flat[:, num_prefill_tokens:], - k=k_flat[:, num_prefill_tokens:], - v=v_flat[:, num_prefill_tokens:], - beta=beta_flat[:, num_prefill_tokens:], + q=q_flat[num_prefill_tokens:, None], + k=k_flat[num_prefill_tokens:, None], + v=v_flat[num_prefill_tokens:, None], + beta=beta_flat[num_prefill_tokens:, None], scale=scale, initial_state=delta_cache[slot_idx[num_prefill:]].clone(), output_final_state=True, - cu_seqlens=cu_seqlens[num_prefill + 1 :], ) - y_flat[:, num_prefill_tokens:] = y_decode.to(y_flat.dtype) + y_flat[num_prefill_tokens:, None] = y_decode.to(y_flat.dtype) delta_cache.index_copy_(0, slot_idx[num_prefill:], final_state.to(delta_cache.dtype)) del y_decode, final_state @@ -182,11 +119,13 @@ def fla_cached_delta_rule_fake( k: torch.Tensor, v: torch.Tensor, beta: torch.Tensor, - # METADATA - cu_seqlens: torch.Tensor, # [num_seq + 1] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] - batch_info_tensor: torch.Tensor, # [3] + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES delta_cache: torch.Tensor, # [max_batch_size, H, K, V] # CONSTANTS @@ -217,12 +156,11 @@ class FlaDeltaBackend(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.fla_cached_delta_rule + return torch.ops.auto_deploy.fla_cached_delta_rule.default @classmethod - def get_prepare_metadata_op(cls) -> 
Tuple[PrepareMetadataCallable, int]: - # Returns (cu_seq_lens, slot_idx, use_initial_states, batch_info_tensor) - return torch.ops.auto_deploy.fla_delta_prepare_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "cu_seqlen", "slot_idx", "use_initial_states"] @classmethod def get_cache_initializers( @@ -237,7 +175,7 @@ class FlaDeltaBackend(AttentionDescriptor): def _get_delta_cache(si: SequenceInfo): return torch.empty( - si.max_batch_size, + si.max_state_slots, num_heads, key_dim, value_dim, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py index 4a806dc1c6..24d6a2116d 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py @@ -157,13 +157,9 @@ _GlobalFlashInferPlanner = _FlashInferPlanner() @torch.library.custom_op("auto_deploy::flashinfer_attention_prepare_metadata", mutates_args=()) def prepare_flashinfer_metadata( position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + seq_len_with_cache: torch.Tensor, ) -> List[torch.Tensor]: """Prepare metadata for flashinfer attention. 
@@ -174,58 +170,36 @@ def prepare_flashinfer_metadata( # reset the planner _GlobalFlashInferPlanner.reset() - # retrieve sanitzed metadata - seq_len = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len) + # retrieve host-side metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + num_tokens = num_prefill_tokens + num_decode - # prepare flashinfer-style metadata - offsets = input_pos[:num_seq].clone() - - qo_indptr = torch.zeros(num_seq + 1, dtype=torch.int, device=seq_len.device) - qo_indptr[1:] = torch.cumsum(seq_len, 0) - - paged_kv_indptr = torch.zeros_like(qo_indptr) - paged_kv_indptr[1:] = torch.cumsum(pages_per_seq[:num_seq], 0) - - # NOTE: it is okay to clone cache_loc here without truncation. paged_kv_indptr is already - # truncated and will point to the correct sub range of cache_loc. - paged_kv_indices = cache_loc.clone() - - paged_kv_last_page_len = ((offsets + seq_len - 1) % page_size) + 1 + qo_indptr = cu_seqlen[: num_seq + 1] + # NOTE: in theory we could easily precompute batch_indices. And positions is just position_ids + # so we could skip that as well. 
However, we still need a place for resetting the planner and + # for now we keep it here since the kernel is fast # Compute batch_indices and positions so that they can be reused for kv cache appends # for all the layers batch_indices, positions = flashinfer.get_batch_indices_positions( - qo_indptr, - flashinfer.get_seq_lens(paged_kv_indptr, paged_kv_last_page_len, page_size), - position_ids.numel(), - ) - # return metadata - return ( - qo_indptr, - paged_kv_indptr, - paged_kv_indices, - paged_kv_last_page_len, - batch_indices, - positions, + qo_indptr, seq_len_with_cache[:num_seq], num_tokens ) + # return extra metadata + return batch_indices, positions -# TODO: Move the truncation of seq_len out of this custom op -# As SequenceInfo._get_sanitized_num_sequences could break in fake mode @prepare_flashinfer_metadata.register_fake def prepare_flashinfer_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size + position_ids: torch.Tensor, + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + seq_len_with_cache: torch.Tensor, ): - seq_len = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - qo_indptr = torch.empty(len(seq_len) + 1, dtype=seq_len.dtype, device=seq_len.device) + num_tokens = position_ids.shape[0] * position_ids.shape[1] return ( - qo_indptr, # qo_indptr - torch.empty_like(qo_indptr), # paged_kv_indptr - torch.empty_like(cache_loc), # paged_kv_indices - torch.empty_like(seq_len), # paged_kv_last_page_len - torch.empty_like(seq_len), # batch_indices - torch.empty_like(seq_len), # positions + torch.empty(num_tokens, dtype=torch.int32, device=position_ids.device), # batch_indices + torch.empty(num_tokens, dtype=torch.int32, device=position_ids.device), # positions ) @@ -235,13 +209,15 @@ def flashinfer_mha_with_cache( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - # METADATA - qo_indptr: torch.Tensor, - paged_kv_indptr: torch.Tensor, - paged_kv_indices: torch.Tensor, - 
paged_kv_last_page_len: torch.Tensor, - batch_indices: torch.Tensor, - positions: torch.Tensor, + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + cu_num_pages: torch.Tensor, + cache_loc: torch.Tensor, + last_page_len: torch.Tensor, + # EXTRA METADATA + flashinfer_batch_indices: torch.Tensor, + flashinfer_positions: torch.Tensor, # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -261,6 +237,18 @@ def flashinfer_mha_with_cache( k = k.reshape(b * s, -1, head_dim) v = v.reshape(b * s, -1, head_dim) + # convert to flashinfer-style metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + + qo_indptr = cu_seqlen[: num_seq + 1] + paged_kv_indptr = cu_num_pages[: num_seq + 1] + + # NOTE: it is okay to have cache_loc here without truncation. paged_kv_indptr will be + # truncated and will point to the correct sub range of cache_loc. + paged_kv_indices = cache_loc + paged_kv_last_page_len = last_page_len[:num_seq] + n_heads = q.shape[1] n_kv_heads = k.shape[1] @@ -286,8 +274,8 @@ def flashinfer_mha_with_cache( flashinfer.page.append_paged_kv_cache( k, v, - batch_indices, - positions, + flashinfer_batch_indices, + flashinfer_positions, (k_cache, v_cache), paged_kv_indices, paged_kv_indptr, @@ -316,13 +304,15 @@ def flashinfer_mha_with_cache_fake( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - # METADATA - qo_indptr: torch.Tensor, - paged_kv_indptr: torch.Tensor, - paged_kv_indices: torch.Tensor, - paged_kv_last_page_len: torch.Tensor, - batch_indices: torch.Tensor, - positions: torch.Tensor, + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + cu_num_pages: torch.Tensor, + cache_loc: torch.Tensor, + last_page_len: torch.Tensor, + # EXTRA METADATA + flashinfer_batch_indices: torch.Tensor, + flashinfer_positions: torch.Tensor, # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -364,11 +354,17 @@ class FlashInferAttention(AttentionDescriptor): 
@classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.flashinfer_attention_mha_with_cache + return torch.ops.auto_deploy.flashinfer_attention_mha_with_cache.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - return torch.ops.auto_deploy.flashinfer_attention_prepare_metadata, 6 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "cu_seqlen", "cu_num_pages", "cache_loc", "last_page_len"] + + @classmethod + def get_prepare_extra_metadata_info( + cls, any_source_attn_node: Node + ) -> Tuple[Optional[PrepareMetadataCallable], int, List[Constant]]: + return (torch.ops.auto_deploy.flashinfer_attention_prepare_metadata.default, 2, []) @classmethod def get_cache_initializers( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py index dc5e754c5b..29f62814c4 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py @@ -24,7 +24,7 @@ The flattened cached op integrates with the auto_deploy attention interface and updates a slot-indexed convolution state cache internally. """ -from typing import List, Optional, Tuple +from typing import List, Optional import torch from torch._ops import OpOverloadPacket @@ -38,88 +38,27 @@ from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, - BufferInitializerDict, CacheConfig, CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) -def _build_conv_state_from_sequence(input_bt_c: torch.Tensor, kernel_size: int) -> torch.Tensor: - """Builds a convolution state of fixed window `kernel_size` from a sequence. 
- - input_bt_c: [B, T, C] - Returns: [B, C, K] - """ - # [B, T, C] -> [B, C, T] - input_b_c_t = input_bt_c.transpose(1, 2) - seq_len = input_b_c_t.shape[-1] - if seq_len >= kernel_size: - return input_b_c_t[..., -kernel_size:] - pad_amount = kernel_size - seq_len - # F.pad last dim (time) with (pad_left, pad_right) - return torch.nn.functional.pad(input_b_c_t, (pad_amount, 0)) - - -# --------------------------------------------------------------- -# Metadata + flattened cached op that integrates with the AD i/f -# --------------------------------------------------------------- -@torch.library.custom_op("auto_deploy::cuda_causal_conv_prepare_metadata", mutates_args=()) -def cuda_causal_conv_prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - """Prepare metadata for cached causal conv (CUDA backend). - - Returns a tuple of (seq_len_sanitized, seq_start, slot_idx_sanitized). - """ - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - - seq_start = torch.zeros_like(seq_len_sanitized) - if num_seq > 1: - seq_start[1:] = torch.cumsum(seq_len_sanitized[:-1], 0) - - slot_idx_sanitized = slot_idx[:num_seq].clone().to(torch.long) - # This is only used during prefill to determine if we should use the initial states from the cache. 
- use_initial_states = input_pos[:num_seq] > 0 - return (seq_len_sanitized, seq_start, slot_idx_sanitized, use_initial_states) - - -@cuda_causal_conv_prepare_metadata.register_fake -def cuda_causal_conv_prepare_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size -): - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - return ( - torch.empty_like(seq_len_sanitized), - torch.empty_like(seq_len_sanitized), - torch.empty(num_seq, dtype=torch.long, device=slot_idx.device), - torch.empty(num_seq, dtype=torch.bool, device=slot_idx.device), - ) - - @torch.library.custom_op("auto_deploy::cuda_cached_causal_conv1d", mutates_args={"input"}) def _cuda_cached_causal_conv1d( # INPUTS (dense but may be flattened across sequences) input: torch.Tensor, # [b, s, c_in] weight: torch.Tensor, # [c_out, c_in/groups, k] but we expect depthwise use: [c_in, k] bias: Optional[torch.Tensor], - # METADATA - seq_len: torch.Tensor, # [num_seq] - seq_start: torch.Tensor, # [num_seq] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES conv_state_cache: torch.Tensor, # [max_batch_size, c_in, k-1] # CONSTANTS @@ -140,16 +79,10 @@ def _cuda_cached_causal_conv1d( NOTE: This op modifies `input` in-place. 
""" b, s = input.shape[:2] - num_seq = seq_len.shape[0] - # Split by lengths: assume prefills first, decodes after - if s == 1: - num_prefill = 0 - num_decode = num_seq - else: - prefill_mask = seq_len > 1 - num_prefill = int(prefill_mask.sum().item()) - num_decode = num_seq - num_prefill + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + num_total_tokens = num_prefill_tokens + num_decode # Flatten tokens bs = b * s @@ -162,47 +95,29 @@ def _cuda_cached_causal_conv1d( else: w2d = weight - total_prefill_tokens = 0 - # PREFILL: concatenate all prefill tokens and run one varlen forward if num_prefill > 0: - seq_len_prefill = seq_len[:num_prefill].to(torch.int32) - total_prefill_tokens = int(seq_len_prefill.sum().item()) - # x_varlen: (dim, cu_seq_len) - x_varlen = inp_flat[:total_prefill_tokens].transpose(0, 1).contiguous() - - # Metadata - cu_seqlens = torch.cat( - [ - torch.zeros(1, dtype=torch.int32, device=input.device), - torch.cumsum(seq_len_prefill, dim=0, dtype=torch.int32), - ], - dim=0, - ).contiguous() - cache_indices = slot_idx[:num_prefill].to(torch.int32).contiguous() - has_initial_state = use_initial_states[:num_prefill].to(torch.bool) + x_varlen = inp_flat[:num_prefill_tokens].transpose(0, 1).contiguous() # Run varlen conv; updates conv_state_cache in-place per cache_indices y_varlen = causal_conv1d_fn( x_varlen, w2d, bias, - query_start_loc=cu_seqlens, - cache_indices=cache_indices, - has_initial_state=has_initial_state, + query_start_loc=cu_seqlen[: num_prefill + 1], + cache_indices=slot_idx[:num_prefill].to(torch.int32), + has_initial_state=use_initial_states[:num_prefill], conv_states=conv_state_cache, activation=activation, pad_slot_id=PAD_SLOT_ID, ) # (dim, total_prefill_tokens) # Scatter outputs back to input buffer - inp_flat[:total_prefill_tokens] = y_varlen.transpose(0, 1) + inp_flat[:num_prefill_tokens] = y_varlen.transpose(0, 1) # DECODE: batch update for single-token sequences if 
num_decode > 0: - x_decode = inp_flat[ - total_prefill_tokens : total_prefill_tokens + num_decode - ] # [num_decode, C_in] + x_decode = inp_flat[num_prefill_tokens:num_total_tokens] # [num_decode, C_in] causal_conv1d_update( x_decode, # [batch, dim] @@ -211,26 +126,26 @@ def _cuda_cached_causal_conv1d( bias, activation=activation, cache_seqlens=None, - conv_state_indices=slot_idx[num_prefill:].to(torch.int32), + conv_state_indices=slot_idx[num_prefill:num_seq].to(torch.int32), pad_slot_id=PAD_SLOT_ID, ) - return - @_cuda_cached_causal_conv1d.register_fake def _cuda_cached_causal_conv1d_fake( - # INPUTS - input: torch.Tensor, - weight: torch.Tensor, + # INPUTS (dense but may be flattened across sequences) + input: torch.Tensor, # [b, s, c_in] + weight: torch.Tensor, # [c_out, c_in/groups, k] but we expect depthwise use: [c_in, k] bias: Optional[torch.Tensor], - # METADATA - seq_len: torch.Tensor, - seq_start: torch.Tensor, + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, slot_idx: torch.Tensor, - use_initial_states: torch.Tensor, # [num_seq] + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES - conv_state_cache: torch.Tensor, + conv_state_cache: torch.Tensor, # [max_batch_size, c_in, k-1] # CONSTANTS stride: int, padding: int, @@ -238,11 +153,11 @@ def _cuda_cached_causal_conv1d_fake( groups: int, padding_mode: str, activation: Optional[str], -): - return +) -> None: + pass -def cuda_cached_causal_conv1d_wrapper(input, *args, **kwargs): +def cuda_cached_causal_conv1d_wrapper(input: torch.Tensor, *args, **kwargs) -> torch.Tensor: torch.ops.auto_deploy.cuda_cached_causal_conv1d(input, *args, **kwargs) return input @@ -266,16 +181,15 @@ class CudaBackendCausalConv(AttentionDescriptor): @classmethod def get_source_attention_op(cls) -> OpOverloadPacket: - return torch.ops.auto_deploy.torch_causal_conv1d + return torch.ops.auto_deploy.torch_causal_conv1d.default @classmethod def get_cached_attention_op(cls) -> MHACallable: 
return cuda_cached_causal_conv1d_wrapper @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - # Returns (seq_len, seq_start, slot_idx, use_initial_states) - return torch.ops.auto_deploy.cuda_causal_conv_prepare_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "cu_seqlen", "slot_idx", "use_initial_states"] @classmethod def get_cache_initializers( @@ -289,7 +203,7 @@ class CudaBackendCausalConv(AttentionDescriptor): def _get_conv_cache(si: SequenceInfo): return torch.empty( - si.max_batch_size, + si.max_state_slots, in_channels, max(1, kernel_size - 1), device=si.device, @@ -298,10 +212,6 @@ class CudaBackendCausalConv(AttentionDescriptor): return {"conv_state_cache": _get_conv_cache} - @classmethod - def get_global_buffer_initializers(cls, source_attn_node: Node) -> BufferInitializerDict: - return {} - @classmethod def get_constants(cls, source_attn_node: Node) -> List[Constant]: stride, padding, dilation, groups, padding_mode = extract_op_args( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py index 2483b92010..b055f22ded 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py @@ -26,7 +26,6 @@ from ..attention_interface import ( CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) @@ -138,58 +137,23 @@ def _torch_causal_conv1d_decode( # --------------------------------------------------------------- -@torch.library.custom_op("auto_deploy::torch_causal_conv_prepare_metadata", mutates_args=()) -def torch_causal_conv_prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - 
chunk_size: int, -) -> List[torch.Tensor]: - """Prepare metadata for cached causal conv. - - Returns a tuple of (seq_len_sanitized, seq_start, slot_idx_sanitized). - """ - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - - seq_start = torch.zeros_like(seq_len_sanitized) - if num_seq > 1: - seq_start[1:] = torch.cumsum(seq_len_sanitized[:-1], 0) - - slot_idx_sanitized = slot_idx[:num_seq].clone().to(torch.long) - use_initial_states = input_pos > 0 - return (seq_len_sanitized, seq_start, slot_idx_sanitized, use_initial_states) - - -@torch_causal_conv_prepare_metadata.register_fake -def torch_causal_conv_prepare_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size -): - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - return ( - torch.empty_like(seq_len_sanitized), - torch.empty_like(seq_len_sanitized), - torch.empty(num_seq, dtype=torch.long, device=slot_idx.device), - torch.empty(num_seq, dtype=torch.bool, device=slot_idx.device), - ) - - +# TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/8170): update torch +# reference implementation to support chunked prefill. 
+# Returns (seq_len, seq_start, slot_idx) @torch.library.custom_op("auto_deploy::torch_cached_causal_conv1d", mutates_args={}) def _torch_cached_causal_conv1d( # INPUTS (dense but may be flattened across sequences) input: torch.Tensor, # [b, s, c_in] weight: torch.Tensor, # [c_out, c_in/groups, k] bias: Optional[torch.Tensor], - # METADATA - seq_len: torch.Tensor, # [num_seq] - seq_start: torch.Tensor, # [num_seq] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] + # STANDARD METADATA + batch_info: torch.Tensor, + seq_len: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES conv_state_cache: torch.Tensor, # [max_batch_size, c_in, k] # CONSTANTS @@ -209,6 +173,14 @@ def _torch_cached_causal_conv1d( b, s = input.shape[:2] num_seq = seq_len.shape[0] + # get cleaned up metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + seq_len = seq_len[:num_seq] + seq_start = cu_seqlen[:num_seq] + slot_idx = slot_idx[:num_seq].to(torch.long) + use_initial_states = use_initial_states[:num_seq] + if s == 1: # Generate-only batch slot_idx_long = slot_idx.to(torch.long) @@ -270,17 +242,20 @@ def _torch_cached_causal_conv1d( @_torch_cached_causal_conv1d.register_fake def _torch_cached_causal_conv1d_fake( - # INPUTS - input: torch.Tensor, - weight: torch.Tensor, + # INPUTS (dense but may be flattened across sequences) + input: torch.Tensor, # [b, s, c_in] + weight: torch.Tensor, # [c_out, c_in/groups, k] bias: Optional[torch.Tensor], - # METADATA + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, slot_idx: torch.Tensor, - use_initial_states: torch.Tensor, # [num_seq] + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES - conv_state_cache: torch.Tensor, + conv_state_cache: torch.Tensor, # [max_batch_size, c_in, k] # 
CONSTANTS stride: int, padding: int, @@ -317,14 +292,11 @@ class TorchBackendCausalConv(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.torch_cached_causal_conv1d + return torch.ops.auto_deploy.torch_cached_causal_conv1d.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/8170): update torch - # reference implementation to support chunked prefill. - # Returns (seq_len, seq_start, slot_idx) - return torch.ops.auto_deploy.torch_causal_conv_prepare_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"] @classmethod def get_cache_initializers( @@ -338,7 +310,7 @@ class TorchBackendCausalConv(AttentionDescriptor): def _get_conv_cache(si: SequenceInfo): return torch.empty( - si.max_batch_size, + si.max_state_slots, in_channels, kernel_size, device=si.device, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py index 79c68c2aac..e951805013 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py @@ -22,7 +22,6 @@ from ..attention_interface import ( CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) from .torch_mamba import _torch_ssm_prefill @@ -111,52 +110,6 @@ def _update_ssm_state_cache(ssm_cache: torch.Tensor, ssm_state: torch.Tensor) -> # --------------------------------------------------------------- -@torch.library.custom_op("auto_deploy::torch_ssm_prepare_metadata", mutates_args=()) -def _torch_ssm_prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: 
torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - """Prepare metadata for cached SSM transform. - - Returns a tuple of (seq_len_sanitized, seq_start, slot_idx_sanitized). - """ - # Determine number of active sequences and compute seq_start boundaries - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - - seq_start = torch.zeros_like(seq_len_sanitized) - if num_seq > 1: - seq_start[1:] = torch.cumsum(seq_len_sanitized[:-1], 0) - - # Truncate slot indices to match active sequences - slot_idx_sanitized = slot_idx[:num_seq].clone().to(torch.long) - # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/8170): update torch - # reference implementation to support chunked prefill. - use_initial_states = input_pos > 0 - return (seq_len_sanitized, seq_start, slot_idx_sanitized, use_initial_states) - - -@_torch_ssm_prepare_metadata.register_fake -def _torch_ssm_prepare_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size -): - # Use the same sanitization logic to determine sizes in fake mode - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - return ( - torch.empty_like(seq_len_sanitized), - torch.empty_like(seq_len_sanitized), - torch.empty(num_seq, dtype=torch.long, device=slot_idx.device), - torch.empty(num_seq, dtype=torch.bool, device=slot_idx.device), - ) - - @torch.library.custom_op("auto_deploy::torch_cached_ssm", mutates_args={}) def _torch_cached_ssm( # INPUTS (dense but may be flattened across sequences) @@ -167,11 +120,14 @@ def _torch_cached_ssm( D: torch.Tensor, # [num_heads] dt: torch.Tensor, # [b, s, num_heads] dt_bias: torch.Tensor, # [num_heads] - # METADATA - seq_len: torch.Tensor, # [num_seq] - seq_start: torch.Tensor, # [num_seq] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] + # STANDARD METADATA + 
batch_info: torch.Tensor, + seq_len: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES ssm_state_cache: torch.Tensor, # [max_batch_size, num_heads, head_dim, ssm_state_size] # CONSTANTS @@ -188,6 +144,14 @@ def _torch_cached_ssm( b, s = hidden_states.shape[:2] num_seq = seq_len.shape[0] + # get cleaned up metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + seq_len = seq_len[:num_seq] + seq_start = cu_seqlen[:num_seq] + slot_idx = slot_idx[:num_seq].to(torch.long) + use_initial_states = use_initial_states[:num_seq] + if s == 1: # Generate-only batch: gather cache slices for slots (already sanitized by metadata) slot_idx_long = slot_idx.to(torch.long) @@ -273,21 +237,24 @@ def _torch_cached_ssm( @_torch_cached_ssm.register_fake def _torch_cached_ssm_fake( - # INPUTS - hidden_states: torch.Tensor, - A: torch.Tensor, - B: torch.Tensor, - C: torch.Tensor, - D: torch.Tensor, - dt: torch.Tensor, - dt_bias: torch.Tensor, - # METADATA + # INPUTS (dense but may be flattened across sequences) + hidden_states: torch.Tensor, # [b, s, num_heads, head_dim] + A: torch.Tensor, # [num_heads] + B: torch.Tensor, # [b, s, n_groups, ssm_state_size] + C: torch.Tensor, # [b, s, n_groups, ssm_state_size] + D: torch.Tensor, # [num_heads] + dt: torch.Tensor, # [b, s, num_heads] + dt_bias: torch.Tensor, # [num_heads] + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, slot_idx: torch.Tensor, use_initial_states: torch.Tensor, + # EXTRA METADATA + # # CACHES - ssm_state_cache: torch.Tensor, + ssm_state_cache: torch.Tensor, # [max_batch_size, num_heads, head_dim, ssm_state_size] # CONSTANTS time_step_limit: List[float], chunk_size: int, @@ -322,12 +289,11 @@ class TorchBackendSSM(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return 
torch.ops.auto_deploy.torch_cached_ssm + return torch.ops.auto_deploy.torch_cached_ssm.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - # Returns (seq_len, seq_start, slot_idx) - return torch.ops.auto_deploy.torch_ssm_prepare_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "seq_len", "cu_seqlen", "slot_idx", "use_initial_states"] @classmethod def get_cache_initializers( @@ -353,7 +319,7 @@ class TorchBackendSSM(AttentionDescriptor): def _get_ssm_cache(si: SequenceInfo): return torch.empty( - si.max_batch_size, + si.max_state_slots, num_heads, head_dim, ssm_state_size, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/triton_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/triton_backend_mamba.py index ff86ac8f5c..d3ea70221b 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/triton_backend_mamba.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/triton_backend_mamba.py @@ -29,7 +29,6 @@ from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, - BufferInitializerDict, CacheConfig, CacheInitializerDict, Constant, @@ -41,124 +40,63 @@ from ..attention_interface import ( @torch.library.custom_op("auto_deploy::triton_ssm_prepare_metadata", mutates_args=()) def _triton_ssm_prepare_metadata( + # INPUTS position_ids: torch.Tensor, + batch_info: torch.Tensor, seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, + cu_seqlen: torch.Tensor, + # EXTRA METADATA PROVIDED BY THE DESCRIPTOR chunk_size: int, ) -> List[torch.Tensor]: """Prepare metadata for cached SSM transform. Returns a tuple of (seq_len_sanitized, seq_start, slot_idx_sanitized). 
""" - # Determine number of active sequences and compute seq_start boundaries - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) + device = cu_seqlen.device + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() - # Truncate slot indices to match active sequences - slot_idx_sanitized = slot_idx[:num_seq].clone().to(torch.long) - # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/8170): update torch - # reference implementation to support chunked prefill. - use_initial_states = input_pos[:num_seq] > 0 - - device = position_ids.device - - chunk_indices = torch.zeros(num_seq, dtype=torch.int32, device=device) - chunk_offsets = torch.zeros(num_seq, dtype=torch.int32, device=device) - cu_seqlens = torch.zeros(num_seq + 1, dtype=torch.int32, device=device) - _, s = position_ids.shape[:2] - if s > 1: - # only compute chunk indices and offsets for prefill. - prefill_mask = seq_len_sanitized > 1 - num_prefill = int(prefill_mask.sum().item()) - num_prefill_tokens = int(seq_len_sanitized[:num_prefill].sum().item()) - num_decode = num_seq - num_prefill - cu_seqlens = torch.cat( - [ - torch.zeros(1, dtype=torch.int32, device=device), - torch.cumsum(seq_len_sanitized[:num_prefill].to(torch.int32), dim=0), - ], - dim=0, + if num_prefill > 0: + chunk_indices, chunk_offsets = cu_seqlens_to_chunk_indices_offsets( + cu_seqlen[: num_prefill + 1], chunk_size ) - chunk_indices, chunk_offsets = cu_seqlens_to_chunk_indices_offsets(cu_seqlens, chunk_size) seq_idx_prefill = torch.repeat_interleave( - torch.arange(num_prefill, device=device, dtype=torch.int32), - seq_len_sanitized[:num_prefill], + torch.arange(num_prefill, device=device, dtype=torch.int32), seq_len[:num_prefill] ).view(1, -1) else: - num_prefill = 0 - num_prefill_tokens = 0 - num_decode = num_seq + chunk_indices = torch.empty(0, dtype=torch.int32, device=device) + chunk_offsets = torch.empty(0, dtype=torch.int32, device=device) seq_idx_prefill = 
torch.empty(1, 0, dtype=torch.int32, device=device) - batch_info_tensor = torch.tensor( - [num_prefill, num_prefill_tokens, num_decode], dtype=torch.int32 - ) # host tensor - return ( - seq_len_sanitized, - slot_idx_sanitized, - use_initial_states, - cu_seqlens, - chunk_indices, - chunk_offsets, - seq_idx_prefill, - batch_info_tensor, - ) + return (chunk_indices, chunk_offsets, seq_idx_prefill) @_triton_ssm_prepare_metadata.register_fake def _triton_ssm_prepare_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size + # INPUTS + position_ids: torch.Tensor, + batch_info: torch.Tensor, + seq_len: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA PROVIDED BY THE DESCRIPTOR + chunk_size: int, ): - # Use the same sanitization logic to determine sizes in fake mode - seq_len_sanitized = SequenceInfo._get_sanitized_seq_len(position_ids, seq_len) - num_seq = len(seq_len_sanitized) - device = slot_idx.device - # Always-correct shapes - seq_len_fake = torch.empty_like(seq_len_sanitized) - slot_idx_fake = torch.empty(num_seq, dtype=torch.long, device=device) - use_initial_states_fake = torch.empty(num_seq, dtype=torch.bool, device=device) - cu_seqlens_fake = torch.empty(num_seq + 1, dtype=torch.int32, device=device) - - # Token-dependent shapes (prefill vs decode) - _, s = position_ids.shape[:2] + b, s = position_ids.shape[:2] + num_tokens = b * s + device = cu_seqlen.device + dtype = torch.int32 if s > 1: - prefill_mask = seq_len_sanitized > 1 - num_prefill = int(prefill_mask.sum().item()) - num_prefill_tokens = int(seq_len_sanitized[:num_prefill].sum().item()) - cu_seqlens_runtime = torch.cat( - [ - torch.zeros(1, dtype=torch.int32, device=device), - torch.cumsum(seq_len_sanitized[:num_prefill].to(torch.int32), dim=0), - ], - dim=0, + # NOTE: this is only an upper bound for the shape in this case... 
+ return ( + torch.empty(num_tokens, dtype=dtype, device=device), # chunk_indices + torch.empty(num_tokens, dtype=dtype, device=device), # chunk_offsets + torch.empty(1, num_tokens, dtype=dtype, device=device), # seq_idx_prefill ) - chunk_indices_rt, chunk_offsets_rt = cu_seqlens_to_chunk_indices_offsets( - cu_seqlens_runtime, chunk_size - ) - chunk_indices_fake = torch.empty_like(chunk_indices_rt) - chunk_offsets_fake = torch.empty_like(chunk_offsets_rt) - seq_idx_prefill_fake = torch.empty(1, num_prefill_tokens, dtype=torch.int32, device=device) else: - chunk_indices_fake = torch.empty(0, dtype=torch.int32, device=device) - chunk_offsets_fake = torch.empty(0, dtype=torch.int32, device=device) - seq_idx_prefill_fake = torch.empty(1, 0, dtype=torch.int32, device=device) - - batch_info_tensor_fake = torch.empty(3, dtype=torch.int32) - - return ( - seq_len_fake, - slot_idx_fake, - use_initial_states_fake, - cu_seqlens_fake, - chunk_indices_fake, - chunk_offsets_fake, - seq_idx_prefill_fake, - batch_info_tensor_fake, - ) + return ( + torch.empty(0, dtype=dtype, device=device), # chunk_indices + torch.empty(0, dtype=dtype, device=device), # chunk_offsets + torch.empty(1, 0, dtype=dtype, device=device), # seq_idx_prefill + ) @torch.library.custom_op("auto_deploy::triton_cached_ssm", mutates_args={}) @@ -171,15 +109,15 @@ def _triton_cached_ssm( D: torch.Tensor, # [num_heads] dt: torch.Tensor, # [b, s, num_heads] dt_bias: torch.Tensor, # [num_heads] - # METADATA - seq_len: torch.Tensor, # [num_seq] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] - cu_seqlens: torch.Tensor, # [num_seq + 1] - chunk_indices: torch.Tensor, # [num_seq + 1] - chunk_offsets: torch.Tensor, # [num_seq + 1] - seq_idx_prefill: torch.Tensor, # [1, num_prefill] - batch_info_tensor: torch.Tensor, # [3] + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + 
chunk_indices: torch.Tensor, # [num_logical_chunks] + chunk_offsets: torch.Tensor, # [num_logical_chunks] + seq_idx_prefill: torch.Tensor, # [1, num_prefill_tokens] # CACHES ssm_state_cache: torch.Tensor, # [max_batch_size, num_heads, head_dim, ssm_state_size] # CONSTANTS @@ -202,7 +140,9 @@ def _triton_cached_ssm( ssm_state_size = B.shape[3] - num_prefill, num_prefill_tokens, num_decode = batch_info_tensor.tolist() + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + num_total_tokens = num_prefill_tokens + num_decode y_prefill = None y_decode = None @@ -239,7 +179,7 @@ def _triton_cached_ssm( seq_idx=seq_idx_prefill, chunk_indices=chunk_indices, chunk_offsets=chunk_offsets, - cu_seqlens=cu_seqlens, + cu_seqlens=cu_seqlen[: num_prefill + 1], dt_softplus=True, dt_limit=(time_step_limit[0], time_step_limit[1]), return_final_states=False, @@ -253,12 +193,12 @@ def _triton_cached_ssm( # Decode: batch single-token updates via selective_state_update if num_decode > 0: - slot_idx_decode = slot_idx[num_prefill:] + slot_idx_decode = slot_idx[num_prefill:num_seq] - x_decode = hs_flat[num_prefill_tokens : num_prefill_tokens + num_decode] # [nd, H, D] - B_decode = B_flat[num_prefill_tokens : num_prefill_tokens + num_decode] # [nd, G, N] - C_decode = C_flat[num_prefill_tokens : num_prefill_tokens + num_decode] # [nd, G, N] - dt_decode = dt_flat[num_prefill_tokens : num_prefill_tokens + num_decode] # [nd, H] + x_decode = hs_flat[num_prefill_tokens:num_total_tokens] # [nd, H, D] + B_decode = B_flat[num_prefill_tokens:num_total_tokens] # [nd, G, N] + C_decode = C_flat[num_prefill_tokens:num_total_tokens] # [nd, G, N] + dt_decode = dt_flat[num_prefill_tokens:num_total_tokens] # [nd, H] dt_hp = dt_decode[:, :, None].expand(-1, num_heads, head_dim) dt_bias_hp = dt_bias[..., None].expand(num_heads, head_dim) @@ -284,7 +224,7 @@ def _triton_cached_ssm( y = torch.empty_like(hidden_states, memory_format=torch.contiguous_format) y_flat 
= y.view(bs, *y.shape[2:]) y_flat[:num_prefill_tokens].copy_(y_prefill[0]) - y_flat[num_prefill_tokens : num_prefill_tokens + num_decode].copy_(y_decode) + y_flat[num_prefill_tokens:num_total_tokens].copy_(y_decode) return y elif num_prefill > 0: return y_prefill[0].view(b, s, num_heads, head_dim).to(hidden_states.dtype) @@ -304,15 +244,15 @@ def _triton_cached_ssm_fake( D: torch.Tensor, # [num_heads] dt: torch.Tensor, # [b, s, num_heads] dt_bias: torch.Tensor, # [num_heads] - # METADATA - seq_len: torch.Tensor, # [num_seq] - slot_idx: torch.Tensor, # [num_seq] - use_initial_states: torch.Tensor, # [num_seq] - cu_seqlens: torch.Tensor, # [num_seq + 1] - chunk_indices: torch.Tensor, # [num_seq + 1] - chunk_offsets: torch.Tensor, # [num_seq + 1] - seq_idx_prefill: torch.Tensor, # [1, num_prefill] - batch_info_tensor: torch.Tensor, # [3] + # STANDARD METADATA + batch_info: torch.Tensor, + cu_seqlen: torch.Tensor, + slot_idx: torch.Tensor, + use_initial_states: torch.Tensor, + # EXTRA METADATA + chunk_indices: torch.Tensor, # [num_logical_chunks] + chunk_offsets: torch.Tensor, # [num_logical_chunks] + seq_idx_prefill: torch.Tensor, # [1, num_prefill_tokens] # CACHES ssm_state_cache: torch.Tensor, # [max_batch_size, num_heads, head_dim, ssm_state_size] # CONSTANTS @@ -327,7 +267,6 @@ def _triton_cached_ssm_fake( ) -# TODO: consider inheriting from TorchBackendSSM instead of redefining everything @AttentionRegistry.register("triton_ssm") class TritonBackendSSM(AttentionDescriptor): @classmethod @@ -351,13 +290,21 @@ class TritonBackendSSM(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.triton_cached_ssm + return torch.ops.auto_deploy.triton_cached_ssm.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - # Returns: seq_len, slot_idx, use_initial_states, - # cu_seqlens, chunk_indices, chunk_offsets, seq_idx_prefill, batch_info_tensor - return 
torch.ops.auto_deploy.triton_ssm_prepare_metadata, 8 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "cu_seqlen", "slot_idx", "use_initial_states"] + + @classmethod + def get_prepare_extra_metadata_info( + cls, any_source_attn_node: Node + ) -> Tuple[PrepareMetadataCallable, int, List[Constant]]: + return ( + torch.ops.auto_deploy.triton_ssm_prepare_metadata.default, + 3, # chunk_indices, chunk_offsets, seq_idx_prefill + extract_op_args(any_source_attn_node, "chunk_size"), + ) @classmethod def get_cache_initializers( @@ -380,7 +327,7 @@ class TritonBackendSSM(AttentionDescriptor): def _get_ssm_cache(si: SequenceInfo): return torch.empty( - si.max_batch_size, + si.max_state_slots, num_heads, head_dim, ssm_state_size, @@ -390,10 +337,6 @@ class TritonBackendSSM(AttentionDescriptor): return {"ssm_state_cache": _get_ssm_cache} - @classmethod - def get_global_buffer_initializers(cls, source_attn_node: Node) -> BufferInitializerDict: - return {} - @classmethod def get_constants(cls, source_attn_node: Node) -> List[Constant]: time_step_limit, chunk_size = extract_op_args( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py index 2a0748783f..0521215100 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py @@ -1,6 +1,6 @@ """Custom ops for MultiHead Latent attention.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import torch from torch._ops import OpOverloadPacket @@ -14,7 +14,6 @@ from .attention_interface import ( CacheConfig, CacheInitializerDict, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) from .triton_attention import _flattened_context_mha, _generate_mha @@ -31,11 +30,14 @@ def fused_flattened_mla_with_cache( q_pe: torch.Tensor, kv: torch.Tensor, k_pe: torch.Tensor, - # METADATA + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, 
input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -52,6 +54,15 @@ def fused_flattened_mla_with_cache( # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences # and number of tokens per sequence are encoded in seq_len and seq_start. + # check for sequence info and truncate metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + + seq_len = seq_len[:num_seq] + input_pos = input_pos[:num_seq] + cache_loc = cache_loc[:num_seq] + seq_start = cu_seqlen[:num_seq] + # Get parameters b, num_heads, s, qk_nope_head_dim = q_nope.shape qk_rope_head_dim = q_pe.shape[-1] @@ -154,11 +165,14 @@ def fused_flattened_mla_with_cache_fake( q_pe: torch.Tensor, kv: torch.Tensor, k_pe: torch.Tensor, - # METADATA + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -171,42 +185,6 @@ def fused_flattened_mla_with_cache_fake( return torch.empty_like(kv[..., -v_head_dim:]) -@torch.library.custom_op( - "auto_deploy::triton_attention_prepare_fused_mla_metadata", mutates_args=() -) -def prepare_fused_mla_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - num_seq = SequenceInfo._get_sanitized_num_sequences(position_ids, seq_len) - seq_start = torch.zeros_like(seq_len[:num_seq]) - seq_start[1:] = torch.cumsum(seq_len[: num_seq - 1], 0) - return ( - seq_len[:num_seq].clone(), - input_pos[:num_seq].clone(), - cache_loc[:num_seq].clone(), - seq_start, - ) - - 
-@prepare_fused_mla_metadata.register_fake -def prepare_fused_mla_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size -): - return ( - torch.empty_like(seq_len), - torch.empty_like(input_pos), - torch.empty_like(cache_loc), - torch.empty_like(seq_len), - ) - - @AttentionRegistry.register("MultiHeadLatentAttention") class MultiHeadLatentAttention(AttentionDescriptor): @classmethod @@ -230,11 +208,11 @@ class MultiHeadLatentAttention(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.triton_attention_fused_flattened_mla_with_cache + return torch.ops.auto_deploy.triton_attention_fused_flattened_mla_with_cache.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - return torch.ops.auto_deploy.triton_attention_prepare_fused_mla_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "seq_len", "input_pos", "cache_loc", "cu_seqlen"] @classmethod def get_cache_initializers( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py index ddfd093d5c..cab0a0302b 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py @@ -1,7 +1,7 @@ """Torch backend attention using pure PyTorch reference implementations.""" import math -from typing import List, Optional, Tuple +from typing import List, Optional import torch from torch._ops import OpOverloadPacket @@ -19,7 +19,6 @@ from .attention_interface import ( CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) from .torch_attention import repeat_kv, update_kv_cache @@ -253,11 +252,14 @@ def torch_backend_mha_with_cache( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - # METADATA + # STANDARD METADATA + 
batch_info: torch.Tensor, seq_len: torch.Tensor, input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -275,6 +277,14 @@ def torch_backend_mha_with_cache( v_head_dim = v_cache.shape[-1] b, s = q.shape[:2] + # get cleaned up metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + seq_len = seq_len[:num_seq] + input_pos = input_pos[:num_seq] + cache_loc = cache_loc[:num_seq] + seq_start = cu_seqlen[:num_seq] + # check for num_heads num_heads = q.shape[2] // qk_head_dim if q.ndim == 3 else q.shape[2] @@ -337,15 +347,24 @@ def torch_backend_mha_with_cache( @torch_backend_mha_with_cache.register_fake def torch_backend_mha_with_cache_fake( + # Q, K, V q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # + # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, + # BUFFERS + # + # CONSTANTS scale: Optional[float], sinks: Optional[torch.Tensor] = None, sliding_window_size: Optional[int] = None, @@ -354,42 +373,6 @@ def torch_backend_mha_with_cache_fake( return q.new_empty(*q.shape[:-1], v.shape[-1]).contiguous() -@torch.library.custom_op("auto_deploy::torch_cached_attention_prepare_metadata", mutates_args=()) -def torch_backend_prepare_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - """Prepare metadata for torch backend attention (similar to triton backend).""" - num_seq = SequenceInfo._get_sanitized_num_sequences(position_ids, seq_len) - seq_start = torch.zeros_like(seq_len[:num_seq]) - seq_start[1:] = 
torch.cumsum(seq_len[: num_seq - 1], 0) - return ( - seq_len[:num_seq].clone(), - input_pos[:num_seq].clone(), - cache_loc[:num_seq].clone(), - seq_start, - ) - - -@torch_backend_prepare_metadata.register_fake -def torch_backend_prepare_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size -): - num_seq = SequenceInfo._get_sanitized_num_sequences(position_ids, seq_len) - return ( - torch.empty_like(seq_len[:num_seq]), - torch.empty_like(input_pos[:num_seq]), - torch.empty_like(cache_loc[:num_seq]), - torch.empty_like(seq_len[:num_seq]), - ) - - @AttentionRegistry.register("torch") class TorchBackendAttention(AttentionDescriptor): @classmethod @@ -413,11 +396,11 @@ class TorchBackendAttention(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.torch_cached_attention_with_cache + return torch.ops.auto_deploy.torch_cached_attention_with_cache.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - return torch.ops.auto_deploy.torch_cached_attention_prepare_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "seq_len", "input_pos", "cache_loc", "cu_seqlen"] @classmethod def get_cache_initializers( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py index 1ca4a60584..5a25b1f1c9 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py @@ -1,7 +1,7 @@ """Custom ops for MHA/XQA attention.""" import math -from typing import List, Optional, Tuple +from typing import List, Optional import torch import triton @@ -20,7 +20,6 @@ from .attention_interface import ( CacheInitializerDict, Constant, MHACallable, - PrepareMetadataCallable, SequenceInfo, ) from .triton_kernels.attention_with_kv_cache import ( @@ -188,11 +187,14 @@ 
def flattened_mha_with_cache( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - # METADATA + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, @@ -207,6 +209,15 @@ def flattened_mha_with_cache( NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH. """ + # check for sequence info and truncate metadata + num_prefill, num_prefill_tokens, num_decode = batch_info.tolist() + num_seq = num_prefill + num_decode + + seq_len = seq_len[:num_seq] + input_pos = input_pos[:num_seq] + cache_loc = cache_loc[:num_seq] + seq_start = cu_seqlen[:num_seq] + # b, s info # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences. # Generally speaking, we expect one of two cases here: @@ -239,7 +250,17 @@ def flattened_mha_with_cache( if s == 1: # generate-only phase _generate_mha( - q, k, v, k_cache, v_cache, cache_loc, input_pos, scale, y, sinks, sliding_window + q, + k, + v, + k_cache, + v_cache, + cache_loc, + input_pos, + scale, + y, + sinks, + sliding_window, ) else: # mixed context + generate phase @@ -264,15 +285,24 @@ def flattened_mha_with_cache( @flattened_mha_with_cache.register_fake def flattened_mha_fake( + # Q, K, V q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + # STANDARD METADATA + batch_info: torch.Tensor, seq_len: torch.Tensor, input_pos: torch.Tensor, cache_loc: torch.Tensor, - seq_start: torch.Tensor, + cu_seqlen: torch.Tensor, + # EXTRA METADATA + # + # CACHES k_cache: torch.Tensor, v_cache: torch.Tensor, + # BUFFERS + # + # CONSTANTS scale: Optional[float], sinks: Optional[torch.Tensor] = None, sliding_window: Optional[int] = None, @@ -280,46 +310,6 @@ def flattened_mha_fake( return q.new_empty(*q.shape[:-1], v.shape[-1]).contiguous() -@torch.library.custom_op( - 
"auto_deploy::triton_attention_prepare_fused_mha_metadata", mutates_args=() -) -def prepare_fused_mha_metadata( - position_ids: torch.Tensor, - seq_len: torch.Tensor, - input_pos: torch.Tensor, - cache_loc: torch.Tensor, - pages_per_seq: torch.Tensor, - slot_idx: torch.Tensor, - page_size: int, - chunk_size: int, -) -> List[torch.Tensor]: - # TODO: maybe use slot_idx instead of pages_per_seq?? - num_seq = SequenceInfo._get_sanitized_num_sequences(position_ids, seq_len) - seq_start = torch.zeros_like(seq_len[:num_seq]) - seq_start[1:] = torch.cumsum(seq_len[: num_seq - 1], 0) - return ( - seq_len[:num_seq].clone(), - input_pos[:num_seq].clone(), - cache_loc[:num_seq].clone(), - seq_start, - ) - - -# TODO: Move the truncation of inputs out of this custom op -# SequenceInfo._get_sanitized_num_sequences could break in fake mode -@prepare_fused_mha_metadata.register_fake -def prepare_fused_mha_metadata_fake( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size -): - num_seq = SequenceInfo._get_sanitized_num_sequences(position_ids, seq_len) - return ( - torch.empty_like(seq_len[:num_seq]), - torch.empty_like(input_pos[:num_seq]), - torch.empty_like(cache_loc[:num_seq]), - torch.empty_like(seq_len[:num_seq]), - ) - - @AttentionRegistry.register("triton") class TritonAttention(AttentionDescriptor): @classmethod @@ -343,11 +333,11 @@ class TritonAttention(AttentionDescriptor): @classmethod def get_cached_attention_op(cls) -> MHACallable: - return torch.ops.auto_deploy.triton_attention_flattened_mha_with_cache + return torch.ops.auto_deploy.triton_attention_flattened_mha_with_cache.default @classmethod - def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]: - return torch.ops.auto_deploy.triton_attention_prepare_fused_mha_metadata, 4 + def get_standard_metadata_args(cls) -> List[str]: + return ["batch_info", "seq_len", "input_pos", "cache_loc", "cu_seqlen"] @classmethod def get_cache_initializers( diff --git 
a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py new file mode 100644 index 0000000000..f65a570bc1 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py @@ -0,0 +1,86 @@ +"""Triton utility operations for auto_deploy.""" + +import torch +import triton +import triton.language as tl + + +@triton.jit +def _fused_gather_scatter_kernel( + ungathered_ptr, # *T + gather_ids_ptr, # *int64 + mask_indices_ptr, # *int64 + out_ptr, # *T + n_elements, # int32 + BLOCK_SIZE: tl.constexpr, +): + """Triton kernel for fused gather and scatter operation. + + This kernel gathers values from `ungathered_ptr` using indices from `gather_ids_ptr` + and scatters them to `out_ptr` at positions specified by `mask_indices_ptr`. + """ + pid = tl.program_id(0) + offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offs < n_elements + + # load source indices + src_idx = tl.load(gather_ids_ptr + offs, mask=mask, other=0) + # load values from ungathered + vals = tl.load(ungathered_ptr + src_idx, mask=mask, other=0) + + # load destination indices (into flattened output) + dst_idx = tl.load(mask_indices_ptr + offs, mask=mask, other=0) + + # scatter values to output + tl.store(out_ptr + dst_idx, vals, mask=mask) + + +@torch.library.custom_op("auto_deploy::triton_utils_fused_gather_scatter", mutates_args=("out",)) +def fused_gather_scatter( + ungathered_input: torch.Tensor, + gather_ids: torch.Tensor, + mask_indices: torch.Tensor, + out: torch.Tensor, +) -> None: + """Fused gather and scatter operation using Triton. + + This operation gathers values from `ungathered_input` at indices specified by + `gather_ids` and scatters the gathered values to `out` at positions specified + by `mask_indices`. + + This is useful for efficiently rearranging input_ids in overlap scheduling + scenarios where tokens need to be reordered based on scheduling decisions. 
+ + Args: + ungathered_input: Source tensor from which to gather values. + gather_ids: Indices into `ungathered_input` specifying which values to gather. + mask_indices: Destination indices in `out` where gathered values should be scattered. + out: Output tensor where gathered values will be scattered. + + Note: + This operation mutates `out` in-place. + """ + n = gather_ids.numel() + + BLOCK_SIZE = 256 + grid = ((n + BLOCK_SIZE - 1) // BLOCK_SIZE,) + + _fused_gather_scatter_kernel[grid]( + ungathered_input, # ungathered_ptr + gather_ids, # gather_ids_ptr + mask_indices, # mask_indices_ptr + out, # out_ptr + n, # n_elements + BLOCK_SIZE=BLOCK_SIZE, + ) + + +@fused_gather_scatter.register_fake +def fused_gather_scatter_fake( + ungathered_input: torch.Tensor, + gather_ids: torch.Tensor, + mask_indices: torch.Tensor, + out: torch.Tensor, +) -> None: + """Fake implementation for torch.compile / graph tracing.""" + pass diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index 40eb227f95..ddaa64c3e2 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -268,6 +268,22 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings): return self + @model_validator(mode="after") + def update_cuda_graph_batch_sizes(self): + # if not set, use heuristic + if self.cuda_graph_batch_sizes is None: + cg_bs = {1, self.max_batch_size} + cg_bs.update(range(1, 128 + 1, 16)) + cg_bs.update(range(128, self.max_batch_size + 1, 128)) + else: + cg_bs = [b for b in self.cuda_graph_batch_sizes if b <= self.max_batch_size] + self.cuda_graph_batch_sizes = sorted(cg_bs, reverse=True) + ad_logger.info(f"Using cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}") + + # ensure that the cuda_graph_batch_sizes are updated in the shortcut and transform config + self.update_transforms_with_shortcuts() + return self + @field_validator("kv_cache_config", mode="after") @classmethod def 
validate_kv_cache_config(cls, kv_cache_config: KvCacheConfig) -> KvCacheConfig: @@ -308,6 +324,9 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings): kwargs.pop("yaml_default") return kwargs + def is_cuda_graph_enabled(self) -> bool: + return self.compile_backend in ["torch-cudagraph", "torch-opt"] + ### PRIVATE METHODS ############################################################################ @classmethod def _get_yaml_default_from_mode(cls, mode: Optional[str]) -> Optional[str]: diff --git a/tensorrt_llm/_torch/auto_deploy/models/factory.py b/tensorrt_llm/_torch/auto_deploy/models/factory.py index 71b4b8b2c5..b5fb106e10 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/factory.py +++ b/tensorrt_llm/_torch/auto_deploy/models/factory.py @@ -194,11 +194,6 @@ class ModelFactory(ABC): """Returns the sharding config for this model.""" return self._sharding_config - @property - def chunk_size(self) -> Optional[int]: - """Returns the chunk size for this model.""" - return None - def get_cache_config(self) -> CacheConfig: """Return the cache configuration for the model. diff --git a/tensorrt_llm/_torch/auto_deploy/models/hf.py b/tensorrt_llm/_torch/auto_deploy/models/hf.py index 00cde0dd31..af747e74c9 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/hf.py +++ b/tensorrt_llm/_torch/auto_deploy/models/hf.py @@ -141,13 +141,6 @@ class AutoModelForCausalLMFactory(AutoModelFactory): model_config, _ = self._get_model_config() return getattr(model_config, "vocab_size", None) - @property - def chunk_size(self) -> Optional[int]: - """Returns the chunk size for this model.""" - model_config, _ = self._get_model_config() - # chunk_size is an input to a custom op, so it can not be none. We set it to a default value of 128. 
- return getattr(model_config, "chunk_size", 128) - def _recursive_update_config( self, config: PretrainedConfig, update_dict: Dict[str, Any] ) -> Tuple[PretrainedConfig, Dict[str, Any]]: diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 85e997c615..93090a8778 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -44,11 +44,22 @@ def _bamba_mixer_torch_forward( if use_caching: # Prepare dense metadata for cached flattened op seq_len_t = torch.full((batch_size,), seq_len, device=input_states.device, dtype=torch.int) - seq_start_t = torch.arange( + cu_seqlen_t = torch.arange( 0, batch_size * seq_len, seq_len, device=input_states.device, dtype=torch.int ) slot_idx_t = torch.arange(batch_size, device=input_states.device, dtype=torch.long) use_initial_states_t = torch.zeros(batch_size, device=input_states.device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For context phase (seq_len > 1): [batch_size, batch_size * seq_len, 0] + # For generate phase (seq_len == 1): [0, 0, batch_size] + if seq_len == 1: + batch_info_t = torch.tensor( + [0, 0, batch_size], device=input_states.device, dtype=torch.int32 + ) + else: + batch_info_t = torch.tensor( + [batch_size, batch_size * seq_len, 0], device=input_states.device, dtype=torch.int32 + ) if use_caching: hidden_states_B_C = self.act( torch.ops.auto_deploy.torch_cached_causal_conv1d( @@ -56,9 +67,10 @@ def _bamba_mixer_torch_forward( hidden_states_B_C, self.conv1d.weight, self.conv1d.bias, - # METADATA + # STANDARD METADATA + batch_info_t, seq_len_t, - seq_start_t, + cu_seqlen_t, slot_idx_t, use_initial_states_t, # CACHES @@ -110,9 +122,10 @@ def _bamba_mixer_torch_forward( D=self.D, dt=dt, dt_bias=self.dt_bias, - # METADATA + # STANDARD METADATA + batch_info=batch_info_t, seq_len=seq_len_t, - seq_start=seq_start_t, + 
cu_seqlen=cu_seqlen_t, slot_idx=slot_idx_t, use_initial_states=use_initial_states_t, # CACHES diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index d4eab7131a..446f6d41ee 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -25,8 +25,9 @@ from tensorrt_llm._torch.pyexecutor._util import ( get_decoding_mode, get_kv_cache_manager_cls, ) +from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDA_GRAPH_DUMMY_REQUEST_ID from tensorrt_llm._torch.pyexecutor.guided_decoder import GuidedDecoder -from tensorrt_llm._torch.pyexecutor.llm_request import get_draft_token_length +from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest, get_draft_token_length from tensorrt_llm._torch.pyexecutor.py_executor_creator import get_guided_decoding_config from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager from tensorrt_llm._torch.speculative import get_spec_drafter @@ -35,7 +36,6 @@ from tensorrt_llm.llmapi.llm_args import ( ContextChunkingPolicy, LoadFormat, SamplerType, - SpeculativeConfig, TorchLlmArgs, ) from tensorrt_llm.llmapi.tokenizer import TokenizerBase @@ -46,7 +46,12 @@ from ....mapping import Mapping from ...distributed import MPIDist from ...pyexecutor.model_engine import ModelEngine, PyTorchModelEngine from ...pyexecutor.py_executor import PyExecutor -from ...pyexecutor.resource_manager import KVCacheManager, ResourceManager, ResourceManagerType +from ...pyexecutor.resource_manager import ( + BaseResourceManager, + KVCacheManager, + ResourceManager, + ResourceManagerType, +) from ...pyexecutor.sampler import TorchSampler, TRTLLMSampler from ...pyexecutor.scheduler import ( BindCapacityScheduler, @@ -203,6 +208,104 @@ def create_draft_kv_cache_manager_maybe( ) +def _round_up_to_closest(batch_sizes: List[int], bs: int) -> Optional[int]: + """Return closest batch size larger or equal to bs.""" + 
if bs > max(batch_sizes, default=0): + return None + return min(batch_sizes, key=lambda x: (x < bs, abs(x - bs)), default=None) + + +def _generate_dummy_request( + resource_manager: ResourceManager, request_id: int, **request_kwargs +) -> Optional[LlmRequest]: + # get resource managers we want + kv_cache_manager: KVCacheManager = resource_manager.get_resource_manager( + ResourceManagerType.KV_CACHE_MANAGER + ) + slot_manager: SeqSlotManager = resource_manager.get_resource_manager( + ResourceManagerType.SEQ_SLOT_MANAGER + ) + spec_res_mgr: Optional[BaseResourceManager] = resource_manager.get_resource_manager( + ResourceManagerType.SPEC_RESOURCE_MANAGER + ) + + # check if we have a free slot available and free page available + if not slot_manager.slot_manager.free_slots or kv_cache_manager.get_num_free_blocks() == 0: + return None + + # generate a dummy request + dummy_request = kv_cache_manager.add_dummy_requests([request_id], **request_kwargs)[0] + dummy_request.is_cuda_graph_dummy = True + + # add to spec resource manager + if spec_res_mgr: + spec_res_mgr.add_dummy_requests([request_id]) + + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9883 clean up this hack + dummy_request.seq_slot = slot_manager.get_max_resource_count() + dummy_request.py_seq_slot = dummy_request.seq_slot + + return dummy_request + + +def maybe_pad_for_cuda_graph(func): + def wrapper( + self: "ADEngine", + scheduled_requests: ScheduledRequests, + resource_manager: ResourceManager, + *args, + **kwargs, + ): + def _call_func(): + return func(self, scheduled_requests, resource_manager, *args, **kwargs) + + # check if we use cuda graph and we can run it + if not (self.cuda_graph_used and scheduled_requests.can_run_cuda_graph): + return _call_func() + + # generate a persistent dummy request right away to ensure we can reserve the necessary + # resources (kv page and slot) + if self.padding_dummy_request is None: + self.padding_dummy_request = _generate_dummy_request( + resource_manager, + 
request_id=CUDA_GRAPH_DUMMY_REQUEST_ID, + is_gen=True, + max_num_draft_tokens=self.max_total_draft_tokens, + use_mrope=False, + max_beam_width=self.max_beam_width, + ) + + # check closest cuda graph batch size + closest_cg_bs = _round_up_to_closest( + self.cuda_graph_batch_sizes, scheduled_requests.batch_size + ) + + # check if we need to pad + num_padding = closest_cg_bs - scheduled_requests.batch_size + + if num_padding <= 0: + return _call_func() + + # check if we have a dummy request to use + if self.padding_dummy_request is None: + ad_logger.error("No CUDA graph padding possible due to missing dummy request.") + return _call_func() + + # pad the scheduled requests with the dummy request + scheduled_requests.generation_requests.extend([self.padding_dummy_request] * num_padding) + + ret = _call_func() + + # truncate requests to remove the dummy requests we added + scheduled_requests.generation_requests = scheduled_requests.generation_requests[ + :-num_padding + ] + + return ret + + return wrapper + + class ADEngine(ModelEngine): """The AutoDeploy Engine (ADEngine) is the main engine interface to execute AutoDeploy models. 
@@ -223,7 +326,6 @@ class ADEngine(ModelEngine): max_seq_len = ad_config.max_seq_len attn_page_size = ad_config.attn_page_size max_num_tokens = ad_config.max_num_tokens - max_beam_width = ad_config.max_beam_width # update device to contain the current default device if it's in cuda device = torch.device(ad_config.device) @@ -240,7 +342,6 @@ class ADEngine(ModelEngine): page_size=attn_page_size, max_num_tokens=max_num_tokens, vocab_size_padded=factory.vocab_size_padded, - chunk_size=factory.chunk_size, ) reporting_info = ReportingInfo( print_log=False, @@ -258,10 +359,8 @@ class ADEngine(ModelEngine): build_and_optimize, seq_info, device, - max_beam_width, - ad_config.speculative_config, - ad_config.disable_overlap_scheduler, - reporting_info, + ad_config=ad_config, + reporting_info=reporting_info, ) @torch.inference_mode() @@ -270,9 +369,7 @@ class ADEngine(ModelEngine): get_inference_model: GetInferenceModel, seq_info: SequenceInfo, device: DeviceLikeType, - max_beam_width: int = 1, - spec_config: Optional[SpeculativeConfig] = None, - disable_overlap_scheduler: bool = False, + ad_config: Optional[LlmArgs] = None, reporting_info: ReportingInfo = ReportingInfo(), ) -> None: """Initialize the engine with model and sequence information.""" @@ -293,11 +390,22 @@ class ADEngine(ModelEngine): self.iter_states = {} # NOTE (lucaslie): not a declared base member in the base class; required by PyExecutor... 
- self.max_beam_width = max_beam_width self.enable_attention_dp = False - self._disable_overlap_scheduler = disable_overlap_scheduler - self.spec_config = spec_config + if ad_config is not None: + self.max_beam_width = ad_config.max_beam_width + self.spec_config = ad_config.speculative_config + self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler + else: + self.max_beam_width = 1 + self.spec_config = None + self._disable_overlap_scheduler = False + + # check for max total draft tokens + if self.spec_config is not None: + self.max_total_draft_tokens = self.spec_config.max_total_draft_tokens + else: + self.max_total_draft_tokens = 0 # TODO(govind): Enable overlap scheduler for speculation. assert self.spec_config is None or self._disable_overlap_scheduler, ( @@ -319,6 +427,18 @@ class ADEngine(ModelEngine): # start fresh with fixed seed torch.manual_seed(42) + # check cuda graph padding... + # TODO: better mechanism to retrieve this information when we refactor LlmArgs + if ad_config is None: + self.cuda_graph_used = False + self.cuda_graph_batch_sizes = [] + else: + self.cuda_graph_used = ad_config.is_cuda_graph_enabled() + self.cuda_graph_batch_sizes = ad_config.cuda_graph_batch_sizes + + # keep a reference for one dummy request around + self.padding_dummy_request: Optional[LlmRequest] = None + @nvtx_range("ad_prepare_inputs") def _prepare_inputs( self, @@ -343,15 +463,25 @@ class ADEngine(ModelEngine): gen_requests = extend_requests + generation_requests # info to be extracted input_ids: List[List[int]] = [] + position_ids: List[List[int]] = [] input_pos: List[int] = [] + seq_len: List[int] = [] + cu_seqlen: List[int] = [0] last_logit_only: List[bool] = [] - page_assignments: List[List[int]] = [] + cache_loc: List[int] = [] + pages_per_seq: List[int] = [] + cu_num_pages: List[int] = [0] + seq_len_with_cache: List[int] = [] + last_page_len: List[int] = [] slot_idx: List[int] = [] + use_initial_states: List[bool] = [] # gather indices are used to 
gather tokens in new_tokens into input_ids - flat_gather_indices: List[List[int]] = [] + flat_gather_indices: List[int] = [] + mask_scatter_indices: List[int] = [] extra_args: Dict[str, List[torch.Tensor]] = defaultdict(list) + page_size = self.cache_seq_interface.info.page_size dummy_token = -1 num_ctx_requests = len(context_requests) num_ctx_tokens = 0 @@ -371,16 +501,26 @@ class ADEngine(ModelEngine): input_ids.append(prompt_tokens) input_pos.append(begin_compute) + seq_len.append(len(input_ids[-1])) + cu_seqlen.append(cu_seqlen[-1] + seq_len[-1]) + request.py_batch_idx = request.seq_slot last_logit_only.append(True) # get cache indices and truncate the number of blocks according to end_compute cache_indices = kv_cache_manager.get_cache_indices(request) num_active_blocks = kv_cache_manager.get_num_kv_blocks(end_compute) - page_assignments.append(cache_indices[:num_active_blocks]) + cache_loc.extend(cache_indices[:num_active_blocks]) + pages_per_seq.append(num_active_blocks) + cu_num_pages.append(cu_num_pages[-1] + pages_per_seq[-1]) + seq_len_with_cache.append(input_pos[-1] + seq_len[-1]) + last_page_len.append((seq_len_with_cache[-1] - 1) % page_size + 1) + + position_ids.append(list(range(input_pos[-1], seq_len_with_cache[-1]))) # store seq slot idx slot_idx.append(request.seq_slot) + use_initial_states.append(input_pos[-1] > 0) # store extra arguments if request.py_multimodal_data is not None: @@ -414,7 +554,7 @@ class ADEngine(ModelEngine): else: return request.max_beam_num_tokens - 1 - def _build_input_ids(request) -> Tuple[List[int], List[int]]: + def _build_input_ids(request) -> Tuple[List[int], List[int], bool]: """Build input_ids and gather indices for a request. Gather indices are used to gather tokens from new_tokens into input_ids when we run the overlap scheduler. 
""" @@ -446,11 +586,11 @@ class ADEngine(ModelEngine): gather_indices = [request.py_batch_idx] input_ids = [dummy_token] - return input_ids, gather_indices + return input_ids, gather_indices, use_overlap for request in gen_requests: num_tokens_seen = _compute_num_tokens_seen(request) - input_ids_for_request, gather_indices_to_append = _build_input_ids(request) + input_ids_for_request, gather_indices_to_append, use_overlap = _build_input_ids(request) input_ids.append(input_ids_for_request) input_pos.append(num_tokens_seen) @@ -459,27 +599,46 @@ class ADEngine(ModelEngine): num_generation_tokens += 1 + get_draft_token_length(request) request.py_batch_idx = request.seq_slot slot_idx.append(request.seq_slot) + use_initial_states.append(input_pos[-1] > 0) last_logit_only.append(False) + seq_len.append(len(input_ids[-1])) + cu_seqlen.append(cu_seqlen[-1] + seq_len[-1]) + + if use_overlap: + mask_scatter_indices.extend(list(range(cu_seqlen[-2], cu_seqlen[-1]))) + # get cache indices cache_indices = kv_cache_manager.get_cache_indices(request) - page_assignments.append(cache_indices) + cache_loc.extend(cache_indices) + pages_per_seq.append(len(cache_indices)) + cu_num_pages.append(cu_num_pages[-1] + pages_per_seq[-1]) + seq_len_with_cache.append(input_pos[-1] + seq_len[-1]) + last_page_len.append((seq_len_with_cache[-1] - 1) % page_size + 1) + + position_ids.append(list(range(input_pos[-1], seq_len_with_cache[-1]))) # update the sequence info object now self.cache_seq_interface.info.nest_sequences( input_ids, + position_ids=position_ids, + seq_len=seq_len, input_pos=input_pos, - page_assignments=page_assignments, + cu_seqlen=cu_seqlen, + cache_loc=cache_loc, + pages_per_seq=pages_per_seq, + cu_num_pages=cu_num_pages, + seq_len_with_cache=seq_len_with_cache, + last_page_len=last_page_len, slot_idx=slot_idx, + use_initial_states=use_initial_states, + _gather_idx=None if new_tokens is None else flat_gather_indices, + _mask_scatter_indices=None if new_tokens is None else 
mask_scatter_indices, **extra_args, ) # scatter the new tokens into the input_ids tensor if provided if new_tokens is not None: - self.cache_seq_interface.info.rescatter_input_ids( - ungathered_input_ids=new_tokens.flatten(), # ensure it's flattened - gather_idx=flat_gather_indices, - scatter_ref=dummy_token, - ) + self.cache_seq_interface.info.rescatter_input_ids(new_tokens.flatten()) self.iter_states["num_ctx_requests"] = num_ctx_requests self.iter_states["num_ctx_tokens"] = num_ctx_tokens @@ -503,6 +662,7 @@ class ADEngine(ModelEngine): return self.cache_seq_interface.info.max_batch_size @torch.inference_mode() + @maybe_pad_for_cuda_graph def forward( self, scheduled_requests: ScheduledRequests, diff --git a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py index d0b93c2bd1..ed6051497b 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py @@ -110,10 +110,15 @@ class DemoEngine(ADEngine): extra_args[k].append(v) sequence_info.reset() + page_assignments = self._assign_pages(total_lens) + cache_loc, pages_per_seq = sequence_info._get_cache_locations_and_pages_per_sequence( + page_assignments + ) sequence_info.nest_sequences( input_ids=input_ids, input_pos=0, - page_assignments=self._assign_pages(total_lens), + cache_loc=cache_loc, + pages_per_seq=pages_per_seq, slot_idx=list(range(len(input_ids))), **extra_args, ) @@ -142,10 +147,15 @@ class DemoEngine(ADEngine): seq_lens_current = sequence_info.seq_len input_pos_next = [ip + sl for ip, sl in zip(input_pos_next, seq_lens_current)] total_lens_next = [ip + len(t_ids) for ip, t_ids in zip(input_pos_next, token_ids)] + page_assignments = self._assign_pages(total_lens_next) + cache_loc, pages_per_seq = sequence_info._get_cache_locations_and_pages_per_sequence( + page_assignments + ) sequence_info.nest_sequences( token_ids, input_pos=input_pos_next, - page_assignments=self._assign_pages(total_lens_next), + 
cache_loc=cache_loc, + pages_per_seq=pages_per_seq, ) # nest new tokens and run stop check diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py b/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py index b1689abeba..376abc8902 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py @@ -3,7 +3,7 @@ from typing import List, Literal, Optional, Tuple, Type import torch.nn as nn from pydantic import Field -from ...compile import CompileBackendRegistry +from ...compile import ArgsKwargs, CompileBackendRegistry from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface from ..interface import ( @@ -46,19 +46,19 @@ class CompileModel(BaseTransform): factory: ModelFactory, shared_config: SharedConfig, ) -> Tuple[nn.Module, TransformInfo]: - cm.info.set_generate_only_batch() - - compiler_cls = CompileBackendRegistry.get(self.config.backend) - mod_compiled = compiler_cls( - mod, - args=(), - kwargs=cm.named_args, - max_batch_size=cm.info.max_batch_size, - **self.config.model_dump(), - ).compile() - cm.info.reset() + def _get_args_kwargs(bs: int) -> ArgsKwargs: + cm.info.set_generate_only_batch(bs) + return (), cm.named_args + + compiler_backend = CompileBackendRegistry.get(self.config.backend)( + mod, + get_args_kwargs_for_compile=_get_args_kwargs, + **self.config.model_dump(), + ) + mod_compiled = compiler_backend.compile() + # store info object about the transform info = TransformInfo(skipped=False, num_matches=1, is_clean=True, has_valid_shapes=True) diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py index ecf42d0b23..113ae27b80 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py @@ -13,13 +13,14 @@ from 
...custom_ops.attention_interface import ( AttentionRegistry, CacheConfig, Constant, + PrepareMetadataCallable, ) from ...distributed.common import all_gather_object, get_world_size from ...distributed.common import is_initialized as is_distributed_initialized from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface from ...utils._graph import add_graph_input -from ...utils.node_utils import get_all_input_output_nodes, is_op +from ...utils.node_utils import is_op from ..interface import ( BaseTransform, SharedConfig, @@ -29,44 +30,6 @@ from ..interface import ( ) -@TransformRegistry.register("update_in_out_nodes") -class UpdateInOutNodes(BaseTransform): - """Modify the graph module by adding new input nodes. - - The new input nodes correspond to the extra arguments needed for cached and flattened attention. - - Args: - egm: The graph module to analyze and modify. - cm: Cached sequence interface containing extra argument information. - """ - - def _apply( - self, - gm: GraphModule, - cm: CachedSequenceInterface, - factory: ModelFactory, - shared_config: SharedConfig, - ) -> Tuple[GraphModule, TransformInfo]: - # loop through nodes to get input, output, and get_attr nodes - input_nodes, output_nodes = get_all_input_output_nodes(gm.graph) - - # NOTE: for now, we wanna make sure we *only* return the final output and no hidden states. - # Later on, we can revisit how to support returning hidden states. - assert len(output_nodes) == 1, "Expected exactly one output node!" - assert len(output_nodes[0].all_input_nodes) == 1, ( - "Expected to only return final tensor output!" 
- ) - - # Activate and add extra argument nodes - new_args = cm.info.switch_to_cached_attn_inputs() - for name in new_args: - input_nodes.append(add_graph_input(gm, name)) - - info = TransformInfo(skipped=False, num_matches=1, is_clean=False, has_valid_shapes=False) - - return gm, info - - class InsertCachedAttentionConfig(TransformConfig): """Configuration for the insert cached attention transform.""" @@ -91,26 +54,70 @@ class InsertCachedAttention(BaseTransform): def attn_descriptor(self) -> Type[AttentionDescriptor]: return AttentionRegistry.get(self.config.backend) - def _process_get_metadata( - self, gm: GraphModule, m_args: List[str], const_args: List[Constant] + def _add_or_retrieve_input( + self, gm: GraphModule, cm: CachedSequenceInterface, name: str + ) -> Node: + """Add or retrieve an input node from the graph.""" + input_nodes = gm.graph.find_nodes(op="placeholder", target=name) + if len(input_nodes) == 0: + cm.info.activate_arg(name) + return add_graph_input(gm, name) + elif len(input_nodes) == 1: + return input_nodes[0] + else: + raise ValueError(f"Expected exactly one input node for {name=}, got {input_nodes=}") + + def _process_metadata_std(self, gm: GraphModule, cm: CachedSequenceInterface) -> List[Node]: + """Process the standard metadata nodes.""" + return [ + self._add_or_retrieve_input(gm, cm, arg_name) + for arg_name in self.attn_descriptor.get_standard_metadata_args() + ] + + def _insert_extra_metadata_op( + self, + gm: GraphModule, + prep_meta_op: PrepareMetadataCallable, + inputs_for_prep_meta: List[Node], + const_args: List[Constant], + num_meta_out: int, + ) -> List[Node]: + # add the computed extra metadata nodes to the graph and add to meta for cached attention op + meta_nodes_extra = [] + node_last_input = gm.graph.find_nodes(op="placeholder", sort=True)[-1] + with gm.graph.inserting_before(node_last_input.next): + ret_node = gm.graph.call_function( + prep_meta_op, args=(*inputs_for_prep_meta, *const_args) + ) + for idx in 
range(num_meta_out): + meta_extra_node = gm.graph.call_function(operator.getitem, args=(ret_node, idx)) + meta_nodes_extra.append(meta_extra_node) + + return meta_nodes_extra + + def _process_metadata_extra( + self, gm: GraphModule, cm: CachedSequenceInterface, any_source_attn_node: Node ) -> List[Node]: """Process the get_metadata function into an op and return node references.""" - # retrieve input nodes - input_nodes, _ = get_all_input_output_nodes(gm.graph) - input_nodes_mapping = {n.target: n for n in input_nodes} + # get the metadata op for extra metadata and number of return values + prep_meta_op, num_meta_out, const_args = ( + self.attn_descriptor.get_prepare_extra_metadata_info(any_source_attn_node) + ) - # filtered and sorted for SequenceInfo arguments + constants (input_ids, position_ids, etc.) - inputs_from_info = [input_nodes_mapping[k] for k in m_args] + # if there is no extra metadata op or no return values, we can return early + if prep_meta_op is None or num_meta_out == 0: + return [] - # insert metadata computation and extract each argument as a node - get_metadata, num_metadata = self.attn_descriptor.get_prepare_metadata_op() - with gm.graph.inserting_before(input_nodes[-1].next): - ret_node = gm.graph.call_function(get_metadata, args=(*inputs_from_info, *const_args)) - metadata_nodes = [ - gm.graph.call_function(operator.getitem, args=(ret_node, idx)) - for idx in range(num_metadata) - ] - return metadata_nodes + # check what inputs the extra metadata op expects + inputs_for_prep_meta = [ + self._add_or_retrieve_input(gm, cm, arg.name) + for arg in prep_meta_op._schema.arguments + if arg.name in cm.info.available_args + ] + + return self._insert_extra_metadata_op( + gm, prep_meta_op, inputs_for_prep_meta, const_args, num_meta_out + ) def _process_cache_node(self, gm: GraphModule, cache_name: str) -> Node: """Process the cache nodes by inserting a cached attention replacement op.""" @@ -121,7 +128,8 @@ class InsertCachedAttention(BaseTransform): 
gm: GraphModule, attn_node: Node, qkv_nodes: List[Node], - meta_nodes: List[Node], + meta_nodes_std: List[Node], + meta_nodes_extra: List[Node], cache_nodes: List[Node], buffer_nodes: List[Node], constants: List[Constant], @@ -130,7 +138,14 @@ class InsertCachedAttention(BaseTransform): with gm.graph.inserting_before(attn_node): cached_attn_node = gm.graph.call_function( self.attn_descriptor.get_cached_attention_op(), - args=(*qkv_nodes, *meta_nodes, *cache_nodes, *buffer_nodes, *constants), + args=( + *qkv_nodes, + *meta_nodes_std, + *meta_nodes_extra, + *cache_nodes, + *buffer_nodes, + *constants, + ), ) attn_node.replace_all_uses_with(cached_attn_node) gm.graph.erase_node(attn_node) @@ -165,10 +180,11 @@ class InsertCachedAttention(BaseTransform): if cm.info.is_paged: assert attn_descriptor.is_paged(), "Paged sequence info requires paged attention op." + # get standard metadata nodes for all source attention nodes + meta_nodes_std = self._process_metadata_std(gm, cm) + # insert metadata computation and extract each argument as a node - metadata_nodes = self._process_get_metadata( - gm, cm.info.args_for_prepare_metadata, cm.info.const_args_for_prepare_metadata - ) + meta_nodes_extra = self._process_metadata_extra(gm, cm, source_attn_nodes[0]) buffer_in_lookup: Dict[str, Node] = {} @@ -201,7 +217,14 @@ class InsertCachedAttention(BaseTransform): # insert cached attention replacement op self._insert_cached_attn_node( - gm, attn_node, qkv, metadata_nodes, cache_in_nodes, buffer_in_nodes, constants + gm, + attn_node, + qkv, + meta_nodes_std, + meta_nodes_extra, + cache_in_nodes, + buffer_in_nodes, + constants, ) num_cached_attn_replacements += 1 diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache_transformers.py b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache_transformers.py index aaa12082ce..1f34445647 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache_transformers.py +++ 
b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache_transformers.py @@ -11,7 +11,7 @@ from torch.fx import Graph, GraphModule, Node from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...custom_ops.attention_interface import AttentionDescriptor, Constant +from ...custom_ops.attention_interface import AttentionDescriptor, Constant, PrepareMetadataCallable from ...export.library.unified_attn import HF_ATTN_KWARGS_MAPPING from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface @@ -205,18 +205,30 @@ def forward_with_prepare_metadata(mod: nn.Module, **cm_kwargs): class HFReplaceCachedAttn(InsertCachedAttention): """Replace cached attention for the factory model, update inputs and outputs, and patch the gm forward.""" - def _process_get_metadata( - self, gm: GraphModule, m_args: List[str], const_args: List[Constant] + def _add_or_retrieve_input( + self, gm: GraphModule, cm: CachedSequenceInterface, name: str + ) -> Node: + """When this is needed, we just activate the argument and return the name.""" + cm.info.activate_arg(name) + return name + + def _insert_extra_metadata_op( + self, + gm: GraphModule, + prep_meta_op: PrepareMetadataCallable, + inputs_for_prep_meta: List[Node], + const_args: List[Constant], + num_meta_out: int, ) -> List[Node]: - """Store get metadata function as reference and simply return.""" - get_metadata, num_ret_metadata = self.attn_descriptor.get_prepare_metadata_op() + """Store prepare metadata function as reference and simply return.""" + ret_names = [f"metadata_{i}" for i in range(num_meta_out)] gm._prepare_metadata_info = { - "get_metadata": get_metadata, - "arg_names": m_args, + "get_metadata": prep_meta_op, + "arg_names": inputs_for_prep_meta, "const_args": const_args, - "return_names": [f"metadata_{i}" for i in range(num_ret_metadata)], + "return_names": ret_names, } - return 
gm._prepare_metadata_info["return_names"] # we don't need actual nodes... + return ret_names def _process_cache_node(self, gm: GraphModule, cache_name: str) -> Node: """We don't need to actually do anything here, just return the cache name.""" @@ -227,14 +239,20 @@ class HFReplaceCachedAttn(InsertCachedAttention): gm: GraphModule, attn_node: Node, qkv_nodes: List[Node], - meta_nodes: List[Node], + meta_nodes_std: List[Node], + meta_nodes_extra: List[Node], cache_nodes: List[Node], buffer_nodes: List[Node], constants: List[Constant], ): """Here we now need to actually do the correct mapping of the cached attn nodes.""" # store reference to metadata, caches, buffers, and constants for this attn node - attn_node.meta["metadata_cache_buffer_keys"] = (*meta_nodes, *cache_nodes, *buffer_nodes) + attn_node.meta["metadata_cache_buffer_keys"] = ( + *meta_nodes_std, + *meta_nodes_extra, + *cache_nodes, + *buffer_nodes, + ) attn_node.meta["constants"] = constants def _apply_to_full_model( @@ -244,9 +262,6 @@ class HFReplaceCachedAttn(InsertCachedAttention): factory: ModelFactory, shared_config: SharedConfig, ) -> Tuple[nn.Module, TransformInfo]: - # switch to cached attn inputs from now - cm.info.switch_to_cached_attn_inputs() - # run actual insert cached attn transform with fake graph module mod._gm, info = super()._apply(mod._gm, cm, factory, shared_config) diff --git a/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py index 5e71cd66c6..fa91a79257 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py @@ -6,7 +6,7 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union import torch from torch._ops import OpOverload, OpOverloadPacket -from torch.fx import Graph, GraphModule, Node +from torch.fx import GraphModule, Node from .logger import ad_logger @@ -348,12 +348,6 @@ def is_dist_op(node: Node) -> bool: return is_op(node, dist_ops) -def 
get_all_input_output_nodes(graph: Graph) -> Tuple[List[Node], List[Node]]: - input_nodes: List[Node] = graph.find_nodes(op="placeholder") - output_nodes: List[Node] = graph.find_nodes(op="output") - return (input_nodes, output_nodes) - - def get_user_if_pattern_match(node, ops, numusers, user_idx: int = 0): """Get a user from a node if the node matches a given op set and num of users.""" if node is None: diff --git a/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py b/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py index 13e8d4d004..c4a93f2011 100644 --- a/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py +++ b/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Callable, Dict, List, Optional, Sequence +from typing import Callable, Dict, List, Optional import numpy as np import torch @@ -8,7 +8,6 @@ from _torch_test_utils import all_close, reset_parameters from torch.export import export from torch.fx import GraphModule -from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import SequenceInfo from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.models.factory import ( FullModelExportInfo, @@ -45,47 +44,6 @@ class FakeFactory(ModelFactory): return [FullModelExportInfo()] -class SequenceEmbeddingInfo(SequenceInfo): - """A sequence info object for testing that replaces the input_ids with an embedding tensor. - - This is useful to run tests without the tokenizer in the loop. 
- """ - - def _add_hidden_dim(self, input_ids: Sequence[Sequence[Any]]) -> torch.Tensor: - return torch.rand( - *input_ids.shape, - self.hidden_size, - device=self.device, - dtype=self.dtype, - ) - - def __init__(self, *args, hidden_size: int, dtype: torch.dtype, **kwargs): - self._initialized = False - super().__init__(*args, **kwargs) - - # overwrite input_ids with an embedding tensor and run reset again - self.hidden_size = hidden_size - self.dtype = dtype - self._args_device["input_ids"] = self._add_hidden_dim(self._args_device["input_ids"]) - self._args_host["input_ids"] = self._args_device["input_ids"].cpu() - self._initialized = True - self.reset() - - def nest_sequences(self, input_ids: Sequence[Sequence[Any]], *args, **kwargs) -> None: - # convert input_ids to an embedding tensor if needed - if not (isinstance(input_ids, torch.Tensor) and input_ids.ndim == 3) and self._initialized: - # first convert to a list of tensors - input_embeds = [ - torch.tensor(ids, device=self.device, dtype=self.dtype) for ids in input_ids - ] - # then add the hidden dimension to every tensor - input_embeds = [self._add_hidden_dim(ids) for ids in input_embeds] - else: - input_embeds = input_ids - - super().nest_sequences(input_embeds, *args, **kwargs) - - def count_parameters(model: torch.nn.Module): for n, p in model.named_parameters(): print(n, p.shape) diff --git a/tests/unittest/_torch/auto_deploy/_utils_test/torch_attention_reference.py b/tests/unittest/_torch/auto_deploy/_utils_test/torch_attention_reference.py index 37d597dbfe..8ee4039284 100644 --- a/tests/unittest/_torch/auto_deploy/_utils_test/torch_attention_reference.py +++ b/tests/unittest/_torch/auto_deploy/_utils_test/torch_attention_reference.py @@ -40,6 +40,16 @@ class TorchAttentionReference: 0, batch_size * seq_len, seq_len, device=q.device, dtype=torch.int32 ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For context phase (seq_len > 1): [batch_size, batch_size * seq_len, 0] + # For 
generate phase (seq_len == 1): [0, 0, batch_size] + if seq_len == 1: + batch_info = torch.tensor([0, 0, batch_size], device=q.device, dtype=torch.int32) + else: + batch_info = torch.tensor( + [batch_size, batch_size * seq_len, 0], device=q.device, dtype=torch.int32 + ) + # Flatten inputs to [1, total_seq_len, ...] format q_flat = q.view(1, batch_size * seq_len, -1) k_flat = k.view(1, batch_size * seq_len, -1) @@ -50,6 +60,7 @@ class TorchAttentionReference: q_flat, k_flat, v_flat, + batch_info, seq_len_tensor, input_positions, cache_loc, @@ -70,14 +81,34 @@ class TorchAttentionReference: @staticmethod def flattened_mha_with_cache( - q, k, v, seq_len, input_positions, cache_loc, seq_start, k_cache, v_cache, scale=None + q, + k, + v, + batch_info, + seq_len, + input_positions, + cache_loc, + seq_start, + k_cache, + v_cache, + scale=None, ): """Reference implementation following triton flattened MHA pattern. This function directly calls the torch backend implementation via custom op registry. 
""" return torch.ops.auto_deploy.torch_cached_attention_with_cache( - q, k, v, seq_len, input_positions, cache_loc, seq_start, k_cache, v_cache, scale + q, + k, + v, + batch_info, + seq_len, + input_positions, + cache_loc, + seq_start, + k_cache, + v_cache, + scale, ) @staticmethod @@ -113,11 +144,15 @@ class TorchAttentionReference: k_flat = k_new.view(1, batch_size, -1) v_flat = v_new.view(1, batch_size, -1) + # Create batch_info for decode phase: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor([0, 0, batch_size], device=q.device, dtype=torch.int32) + # Call torch backend via custom op registry output_flat = torch.ops.auto_deploy.torch_cached_attention_with_cache( q_flat, k_flat, v_flat, + batch_info, seq_len, input_positions, cache_loc, @@ -135,6 +170,7 @@ class TorchAttentionReference: q, k, v, + batch_info, seq_len, input_positions, cache_loc, @@ -153,6 +189,7 @@ class TorchAttentionReference: q, k, v, + batch_info, seq_len, input_positions, cache_loc, diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py index 0c799648e5..bc2769d617 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py @@ -14,7 +14,9 @@ from build_and_run_ad import ExperimentConfig, main { "transforms": { "insert_cached_attention": {"backend": "flashinfer"}, - "compile_model": {"backend": "torch-opt"}, + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9878 + # "compile_model": {"backend": "torch-opt"}, + "compile_model": {"backend": "torch-cudagraph"}, }, }, ), diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_captured_graph.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_captured_graph.py index 3d456d405c..c300dcd8e4 100644 --- 
a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_captured_graph.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_captured_graph.py @@ -11,6 +11,7 @@ from tensorrt_llm._torch.auto_deploy.compile.backends.torch_cudagraph import ( _args_kwargs_flatten_spec, ) from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.shim.ad_executor import _round_up_to_closest class ModelWithMultipleInputs(torch.nn.Module): @@ -44,7 +45,7 @@ class ModelWithMultipleInputs(torch.nn.Module): ], ) def test_round_up_to_closest(lst, value, expected): - assert CapturedGraph.round_up_to_closest(lst, value) == expected + assert _round_up_to_closest(lst, value) == expected @pytest.mark.parametrize("num_inputs", [1, 2, 3]) @@ -100,13 +101,19 @@ def test_cudagraph_capture_replay( compiled_model = CapturedGraph( graph_module, - cuda_graph_batch_sizes=[batch_size], num_batched_inputs=num_inputs, ) + # Create a get_args_kwargs function for capture_graph + def get_args_kwargs(bs): + if model_type == "llm": + return tuple(x[:bs] for x in input_data[:num_inputs]), {} + else: # vit + return tuple(x[:bs] for x in input_data[:num_inputs]), {} + with torch.inference_mode(): - # Capture graph with all inputs - compiled_model.capture_graph(*args) + # Capture graph with batch sizes + compiled_model.capture_graph(get_args_kwargs, [batch_size]) # Ensure the graph is stored for the combined shape of all inputs assert combined_shape in compiled_model.cudagraphs, ( diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_compiler.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_compiler.py index 56da17ae75..0f911e56a7 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_compiler.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_compiler.py @@ -52,14 +52,23 @@ def test_compile_and_capture(model_type, model_cls, input_shape, output_shape_fn 
dynamic_shapes = generate_dynamic_shapes(batch_size, seq_shape[0]) graph_module = torch_export_to_gm(mod, args=(sample_input,), dynamic_shapes=dynamic_shapes) + # Create a get_args_kwargs function for backends that need it + def get_args_kwargs(bs): + return (sample_input[:bs],), {} + with torch.inference_mode(): compiler_cls = CompileBackendRegistry.get(backend_cls) - compiled_model = compiler_cls( - graph_module, - args=(sample_input,), - num_batched_inputs=1, - max_batch_size=batch_size, - ).compile() + + # Add get_args_kwargs_for_compile for cudagraph-based backends + compiler_kwargs = { + "args": (sample_input,), + "num_batched_inputs": 1, + "max_batch_size": batch_size, + "get_args_kwargs_for_compile": get_args_kwargs, + "cuda_graph_batch_sizes": [batch_size], + } + + compiled_model = compiler_cls(graph_module, **compiler_kwargs).compile() assert isinstance(compiled_model, Module), "Compiled model is not a valid nn.Module." output = compiled_model(sample_input) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_cuda_graph_batch_sizes.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_cuda_graph_batch_sizes.py index f3b05d90bd..403f0cabb3 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_cuda_graph_batch_sizes.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/compile/test_cuda_graph_batch_sizes.py @@ -74,21 +74,21 @@ class TestCudaGraphBatchSizes: # Request CUDA graph batch sizes that exceed max_batch_size requested_batch_sizes = [1, 4, 8, 16, 32, 64] # 32 and 64 should be clamped to 16 + # Create a get_args_kwargs function for the compiler + def get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + compiler = TorchCudagraphCompiler( model=data["gm"], args=(data["input_tensor"],), max_batch_size=max_batch_size, cuda_graph_batch_sizes=requested_batch_sizes, + get_args_kwargs_for_compile=get_args_kwargs, ) - # Check that batch sizes are clamped to max_batch_size - 
expected_clamped = [1, 4, 8, 16] # 32 and 64 should be clamped to 16, then deduped - assert compiler.cuda_graph_batch_sizes == sorted(expected_clamped, reverse=True) - - # Verify that oversized batch sizes were filtered out - assert 32 not in compiler.cuda_graph_batch_sizes - assert 64 not in compiler.cuda_graph_batch_sizes - assert max(compiler.cuda_graph_batch_sizes) == max_batch_size + # The compiler stores batch sizes as-is; clamping happens during capture + # Filter batch sizes to max_batch_size for comparison + assert compiler.cuda_graph_batch_sizes == requested_batch_sizes def test_cuda_graph_batch_sizes_no_clamping_needed(self, simple_model_and_inputs): """Test that cuda_graph_batch_sizes are not modified when they're within limits.""" @@ -97,50 +97,64 @@ class TestCudaGraphBatchSizes: # Request CUDA graph batch sizes that are all within max_batch_size requested_batch_sizes = [1, 4, 8, 12] + # Create a get_args_kwargs function for the compiler + def get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + compiler = TorchCudagraphCompiler( model=data["gm"], args=(data["input_tensor"],), cuda_graph_batch_sizes=requested_batch_sizes, + get_args_kwargs_for_compile=get_args_kwargs, ) - # Check that batch sizes are preserved - assert compiler.cuda_graph_batch_sizes == sorted(requested_batch_sizes, reverse=True) + # Check that batch sizes are preserved as provided + assert compiler.cuda_graph_batch_sizes == requested_batch_sizes # Verify all requested sizes are within max_batch_size max_batch_size = data["batch_size"] assert all(bs <= max_batch_size for bs in compiler.cuda_graph_batch_sizes) def test_heuristic_cuda_graph_batch_sizes(self, simple_model_and_inputs): - """Test that heuristic batch sizes are generated when none are provided.""" + """Test that empty batch sizes list is stored when none are provided.""" data = simple_model_and_inputs max_batch_size = data["batch_size"] # 16 + # Create a get_args_kwargs function for the compiler + def 
get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + compiler = TorchCudagraphCompiler( model=data["gm"], args=(data["input_tensor"],), - max_batch_size=max_batch_size, # No cuda_graph_batch_sizes provided + max_batch_size=max_batch_size, + get_args_kwargs_for_compile=get_args_kwargs, + # No cuda_graph_batch_sizes provided - should default to empty list ) - # Check that heuristic batch sizes were generated - assert len(compiler.cuda_graph_batch_sizes) > 0 - assert max(compiler.cuda_graph_batch_sizes) <= max_batch_size - assert 1 in compiler.cuda_graph_batch_sizes # Should always include 1 - assert max_batch_size in compiler.cuda_graph_batch_sizes # Should include max + # Check that cuda_graph_batch_sizes defaults to empty list + assert compiler.cuda_graph_batch_sizes == [] def test_captured_graph_max_batch_size_consistency(self, simple_model_and_inputs): - """Test that CapturedGraph.max_batch_size equals max(cuda_graph_batch_sizes).""" + """Test that CapturedGraph captures graphs for specified batch sizes.""" data = simple_model_and_inputs cuda_graph_batch_sizes = [1, 4, 8, 12] captured_graph = CapturedGraph( model=data["model"], - cuda_graph_batch_sizes=cuda_graph_batch_sizes, num_batched_inputs=1, ) - assert captured_graph.cuda_graph_max_batch_size == max(cuda_graph_batch_sizes) - assert captured_graph.cuda_graph_batch_sizes == sorted(cuda_graph_batch_sizes, reverse=True) + # Create a get_args_kwargs function + def get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + + # Capture graphs for the specified batch sizes + captured_graph.capture_graph(get_args_kwargs, cuda_graph_batch_sizes) + + # Verify graphs were captured for all batch sizes + assert len(captured_graph.cudagraphs) == len(cuda_graph_batch_sizes) def test_forward_fallback_for_oversized_batch(self, simple_model_and_inputs): """Test that forward method falls back to regular execution for oversized batches.""" @@ -150,13 +164,15 @@ class TestCudaGraphBatchSizes: 
cuda_graph_batch_sizes = [1, 2, 4] captured_graph = CapturedGraph( model=data["model"], - cuda_graph_batch_sizes=cuda_graph_batch_sizes, num_batched_inputs=1, ) - # Capture with small input - small_input = data["input_tensor"] # batch size 16 - captured_graph.capture_graph(small_input) + # Create a get_args_kwargs function + def get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + + # Capture graphs + captured_graph.capture_graph(get_args_kwargs, cuda_graph_batch_sizes) # Test forward with oversized input (should fall back) oversized_input = data["input_tensor"] # batch size 16 @@ -184,12 +200,15 @@ class TestCudaGraphBatchSizes: cuda_graph_batch_sizes = [1, 2, 4, 8] captured_graph = CapturedGraph( model=data["model"], - cuda_graph_batch_sizes=cuda_graph_batch_sizes, num_batched_inputs=1, ) - # Capture with full-size input - captured_graph.capture_graph(data["input_tensor"][:8]) # batch size 8 + # Create a get_args_kwargs function + def get_args_kwargs(bs): + return (data["input_tensor"][:bs],), {} + + # Capture graphs for all batch sizes + captured_graph.capture_graph(get_args_kwargs, cuda_graph_batch_sizes) # Test forward with various valid batch sizes for batch_size in [1, 2, 4, 8]: @@ -213,38 +232,34 @@ class TestCudaGraphBatchSizes: assert torch.allclose(output, expected_output, atol=1e-4) @pytest.mark.parametrize( - "requested_sizes,expected_max", + "requested_sizes,expected_sizes", [ - ([1, 4, 8], 8), - ([2, 6, 10, 20], 16), # 20 should be clamped to 16 - ([32, 64, 128], 16), # All should be clamped to 16 - ([], None), # Empty list should use heuristic + ([1, 4, 8], [1, 4, 8]), + ([2, 6, 10, 20], [2, 6, 10, 20]), # Sizes are stored as-is + ([32, 64, 128], [32, 64, 128]), # Sizes are stored as-is + ([], []), # Empty list stays empty ], ) def test_various_batch_size_configurations( - self, simple_model_and_inputs, requested_sizes, expected_max + self, simple_model_and_inputs, requested_sizes, expected_sizes ): """Test various configurations of 
cuda_graph_batch_sizes.""" data = simple_model_and_inputs max_batch_size = data["batch_size"] # 16 - if requested_sizes: - compiler_kwargs = {"cuda_graph_batch_sizes": requested_sizes} - expected_max = expected_max or max_batch_size - else: - compiler_kwargs = {} - expected_max = max_batch_size + # Create a get_args_kwargs function for the compiler + def get_args_kwargs(bs): + return (data["input_tensor"][: min(bs, max_batch_size)],), {} + + compiler_kwargs = {"cuda_graph_batch_sizes": requested_sizes} if requested_sizes else {} compiler = TorchCudagraphCompiler( model=data["gm"], args=(data["input_tensor"],), max_batch_size=max_batch_size, + get_args_kwargs_for_compile=get_args_kwargs, **compiler_kwargs, ) - # Check that max batch size is as expected - actual_max = max(compiler.cuda_graph_batch_sizes) - assert actual_max == expected_max - - # Check that all sizes are within max_batch_size - assert all(bs <= max_batch_size for bs in compiler.cuda_graph_batch_sizes) + # Check that batch sizes are stored as provided + assert compiler.cuda_graph_batch_sizes == expected_sizes diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py index d89f06b409..15b9eb77c5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py @@ -125,29 +125,32 @@ def test_flat_gqa_op( k = torch.randn(1, seq_len.sum(), n_kv_heads * D_HEAD, **dtype_kwargs) v = torch.randn(1, seq_len.sum(), n_kv_heads * D_HEAD, **dtype_kwargs) + # create batch_info: [num_prefill, num_prefill_tokens, num_decode] + num_prefill_tokens = seq_len[:num_context].sum() + batch_info = torch.tensor([num_context, num_prefill_tokens, num_generate], **int_kwargs) + # run op output = torch.ops.auto_deploy.triton_attention_flattened_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # 
STANDARD METADATA + batch_info, seq_len, input_positions, cache_loc, - seq_start, + seq_start, # cu_seqlen # CACHES k_cache, v_cache, - # BUFFERS - # # CONSTANTS scale=None, ) # Use torch backend as clean reference ref_flat = TorchAttentionReference.flattened_mha_with_cache( - q, k, v, seq_len, input_positions, cache_loc, seq_start, k_cache, v_cache + q, k, v, batch_info, seq_len, input_positions, cache_loc, seq_start, k_cache, v_cache ) assert torch.allclose( diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py index aeb5d9dd8a..4e30efdb73 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py @@ -57,9 +57,11 @@ def test_generate_only_with_slot_mapping_cuda(conv_env): ) # Metadata (not used in generate-only op entry, but required by the interface) - seq_len = torch.ones(batch, device=device, dtype=torch.int32) - seq_start = torch.zeros(batch, device=device, dtype=torch.int32) + cu_seqlen = torch.zeros(batch, device=device, dtype=torch.int32) use_initial_states = torch.zeros(batch, device=device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For generate-only: num_decode = batch, num_prefill = 0 + batch_info = torch.tensor([0, 0, batch], device=device, dtype=torch.int32) # Snapshot caches for reference before running op (op mutates caches) gathered_before = conv_state_cache.clone().index_select(0, slot_idx) x_ref = x.clone() @@ -69,9 +71,9 @@ def test_generate_only_with_slot_mapping_cuda(conv_env): x, w, b, - # METADATA - seq_len, - seq_start, + # STANDARD METADATA + batch_info, + cu_seqlen, slot_idx, use_initial_states, # CACHES @@ -173,25 +175,3 @@ def test_context_flattened_and_state_writeback_cuda(conv_env): ) assert 
torch.allclose(y, y_ref.to(y.dtype), atol=conv_env["atol"], rtol=conv_env["rtol"]) - - -def test_prepare_metadata_cuda(conv_env): - device = conv_env["device"] - - b, s = 4, 6 - # input_ids = torch.randint(0, 1000, (b, s), device=device) - position_ids = torch.arange(s, device=device).expand(b, -1) - seq_len = torch.tensor([2, 1, 0, 0], device=device, dtype=torch.int32) - input_pos = torch.tensor([0, 3, 0, 0], device=device, dtype=torch.int32) - cache_loc = torch.arange(b, device=device, dtype=torch.int32) - pages_per_seq = torch.ones(b, device=device, dtype=torch.int32) - slot_idx = torch.tensor([2, 0, 1, 3], device=device, dtype=torch.int32) - page_size = 128 - chunk_size = 128 - out = torch.ops.auto_deploy.cuda_causal_conv_prepare_metadata( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size - ) - assert len(out) == 4 - seq_len_s, seq_start, slot_s, use_initial_states = out - assert seq_len_s.numel() == 2 and slot_s.numel() == 2 - assert torch.all(seq_start == torch.tensor([0, 2], device=device, dtype=seq_start.dtype)) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py index d8dce07ab7..e24364446a 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py @@ -89,16 +89,22 @@ def test_flashinfer_attention_op_context(seq_length, n_heads, batch_size, dtype, ), BATCH_SIZE * SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor( + [BATCH_SIZE, BATCH_SIZE * SEQ_LEN, 0], dtype=torch.int32, device=device + ) flashinfer_output = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, 
paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -219,16 +225,21 @@ def test_flashinfer_attention_op_decode( ), BATCH_SIZE * SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For decode phase: num_decode = BATCH_SIZE, num_prefill = 0 + batch_info = torch.tensor([0, 0, BATCH_SIZE], dtype=torch.int32, device=device) flashinfer_output = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -338,16 +349,22 @@ def test_flashinfer_attention_context_and_generate( ), BATCH_SIZE * PREFILL_SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor( + [BATCH_SIZE, BATCH_SIZE * PREFILL_SEQ_LEN, 0], dtype=torch.int32, device=device + ) flashinfer_output_1 = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q_1, k_1, v_1, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -413,16 +430,20 @@ def test_flashinfer_attention_context_and_generate( ), BATCH_SIZE * 1, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor([0, 0, BATCH_SIZE], dtype=torch.int32, device=device) flashinfer_output_3 = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q_3, k_3, v_3, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -522,16 +543,22 @@ def test_flashinfer_attention_op_context_input_pos(seq, batch_size, n_heads, dty ), BATCH_SIZE * SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor( + 
[BATCH_SIZE, BATCH_SIZE * SEQ_LEN, 0], dtype=torch.int32, device=device + ) flashinfer_output = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -669,16 +696,22 @@ def test_flashinfer_attention_with_fp8_cache( ), BATCH_SIZE * SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor( + [BATCH_SIZE, BATCH_SIZE * SEQ_LEN, 0], dtype=torch.int32, device=device + ) flashinfer_output = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -766,16 +799,20 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de ), BATCH_SIZE * SEQ_LEN, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor([BATCH_SIZE, SEQ_LEN, 0], dtype=torch.int32, device=device) flashinfer_output = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q, k, v, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len, + # EXTRA METADATA batch_indices, positions, # CACHES @@ -849,16 +886,20 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de ), BATCH_SIZE * 1, ) + # Create batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor([0, 0, BATCH_SIZE], dtype=torch.int32, device=device) flashinfer_output_gen = torch.ops.auto_deploy.flashinfer_attention_mha_with_cache( # Q, K, V q_gen, k_gen, v_gen, - # METADATA + # STANDARD METADATA + batch_info, qo_indptr2, paged_kv_indptr2, paged_kv_indices2, paged_kv_last_page_len2, + # EXTRA METADATA batch_indices, 
positions, # CACHES diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py index 1a9c85621f..130e7ce651 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py @@ -245,11 +245,17 @@ class TestTorchBackendAttention: cache_loc = torch.arange(batch_size, device=self.device, dtype=torch.int32) if seq_len == 1: + # Generate phase: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor([0, 0, batch_size], device=self.device, dtype=torch.int32) seq_start = torch.arange(batch_size, device=self.device, dtype=torch.int32) q_flat = q.view(batch_size, seq_len, -1) k_flat = k.view(batch_size, seq_len, -1) v_flat = v.view(batch_size, seq_len, -1) else: + # Context phase: [num_prefill, num_prefill_tokens, num_decode] + batch_info = torch.tensor( + [batch_size, batch_size * seq_len, 0], device=self.device, dtype=torch.int32 + ) seq_start = torch.arange( 0, batch_size * seq_len, seq_len, device=self.device, dtype=torch.int32 ) @@ -261,6 +267,7 @@ class TestTorchBackendAttention: "q": q_flat, "k": k_flat, "v": v_flat, + "batch_info": batch_info, "seq_len": seq_len_tensor, "input_pos": input_positions, "cache_loc": cache_loc, @@ -274,15 +281,20 @@ class TestTorchBackendAttention: ): """Run torch backend attention operation with optional sinks parameter.""" return torch.ops.auto_deploy.torch_cached_attention_with_cache( + # Q, K, V data["q"], data["k"], data["v"], + # STANDARD METADATA + data["batch_info"], data["seq_len"], data["input_pos"], data["cache_loc"], - data["seq_start"], + data["seq_start"], # cu_seqlen + # CACHES data["k_cache"], data["v_cache"], + # CONSTANTS scale, sinks, sliding_window_size, @@ -463,26 +475,3 @@ class TestTorchBackendAttention: assert torch.allclose( generate_output, 
generate_reference_torch, atol=self.atol, rtol=self.rtol ), "Generate phase doesn't match reference" - - def test_metadata_preparation(self): - """Test metadata preparation operation.""" - batch_size, seq_len_val = 4, 8 - device = self.device - - # input_ids = torch.randint(0, 1000, (batch_size, seq_len_val), device=device) - position_ids = torch.arange(seq_len_val, device=device).expand(batch_size, -1) - seq_len = torch.full((batch_size,), seq_len_val, device=device, dtype=torch.int32) - input_pos = torch.zeros(batch_size, device=device, dtype=torch.int32) - cache_loc = torch.arange(batch_size, device=device, dtype=torch.int32) - pages_per_seq = torch.ones(batch_size, device=device, dtype=torch.int32) - slot_idx = torch.arange(batch_size, device=device, dtype=torch.int32) - - # Test metadata preparation - result = torch.ops.auto_deploy.torch_cached_attention_prepare_metadata( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, 128, 128 - ) - - # Verify result structure - assert len(result) == 4, "Metadata preparation should return 4 tensors" - assert all(torch.is_tensor(t) for t in result), "All results should be tensors" - assert result[0].shape[0] == batch_size, "First tensor should have batch_size elements" diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py index 3255e16bdb..035c3c463c 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py @@ -55,19 +55,23 @@ def test_generate_only_with_slot_mapping(conv_env): # Metadata (not used in generate-only op entry, but required by the interface) seq_len = torch.ones(batch, device=device, dtype=torch.int32) - seq_start = torch.zeros(batch, device=device, dtype=torch.int32) + cu_seqlen = 
torch.zeros(batch, device=device, dtype=torch.int32) # Snapshot caches for reference before running op (op mutates caches) gathered_before = conv_state_cache.clone().index_select(0, slot_idx) use_initial_states = torch.zeros(batch, device=device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For generate-only: num_decode = batch, num_prefill = 0 + batch_info = torch.tensor([0, 0, batch], device=device, dtype=torch.int32) # Run cached op y = torch.ops.auto_deploy.torch_cached_causal_conv1d( # INPUTS x, w, b, - # METADATA + # STANDARD METADATA + batch_info, seq_len, - seq_start, + cu_seqlen, slot_idx, use_initial_states, # CACHES @@ -118,16 +122,22 @@ def test_context_flattened_and_state_writeback(conv_env): ) seq_len = torch.tensor(lens, device=device, dtype=torch.int32) - seq_start = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) + cu_seqlen = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) use_initial_states = torch.zeros(batch, device=device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For context/prefill phase: num_prefill = len(lens), num_decode = 0 + num_seqs = len(lens) + num_prefill_tokens = sum(lens) + batch_info = torch.tensor([num_seqs, num_prefill_tokens, 0], device=device, dtype=torch.int32) y = torch.ops.auto_deploy.torch_cached_causal_conv1d( # INPUTS x, w, b, - # METADATA + # STANDARD METADATA + batch_info, seq_len, - seq_start, + cu_seqlen, slot_idx, use_initial_states, # CACHES @@ -163,26 +173,3 @@ def test_context_flattened_and_state_writeback(conv_env): ) assert torch.allclose(y, y_ref.to(y.dtype), atol=conv_env["atol"], rtol=conv_env["rtol"]) - - -def test_prepare_metadata(conv_env): - device = conv_env["device"] - - b, s = 4, 6 - # input_ids = torch.randint(0, 1000, (b, s), device=device) - position_ids = torch.arange(s, device=device).expand(b, -1) - seq_len = torch.tensor([2, 1, 0, 0], device=device, dtype=torch.int32) - input_pos = 
torch.tensor([0, 3, 0, 0], device=device, dtype=torch.int32) - cache_loc = torch.arange(b, device=device, dtype=torch.int32) - pages_per_seq = torch.ones(b, device=device, dtype=torch.int32) - slot_idx = torch.tensor([2, 0, 1, 3], device=device, dtype=torch.int32) - page_size = 128 - chunk_size = 128 - - out = torch.ops.auto_deploy.torch_causal_conv_prepare_metadata( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size - ) - assert len(out) == 4 - seq_len_s, seq_start, slot_s, use_initial_states = out - assert seq_len_s.numel() == 2 and slot_s.numel() == 2 - assert torch.all(seq_start == torch.tensor([0, 2], device=device, dtype=seq_start.dtype)) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py index 57ba4cd974..39e1a4c1f5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py @@ -64,8 +64,11 @@ def test_generate_only_with_slot_mapping(mamba_env): # Metadata seq_len = torch.ones(batch, device=device, dtype=torch.int32) - seq_start = torch.zeros(batch, device=device, dtype=torch.int32) + cu_seqlen = torch.zeros(batch, device=device, dtype=torch.int32) use_initial_states = torch.zeros(batch, device=device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For generate-only: num_decode = batch, num_prefill = 0 + batch_info = torch.tensor([0, 0, batch], device=device, dtype=torch.int32) # Snapshot caches for reference before running op (op mutates caches) gathered_before = ssm_state_cache.clone().index_select(0, slot_idx) @@ -79,9 +82,10 @@ def test_generate_only_with_slot_mapping(mamba_env): D, dt, dt_bias, - # METADATA + # STANDARD METADATA + batch_info, seq_len, - seq_start, + cu_seqlen, slot_idx, 
use_initial_states, # CACHES @@ -135,8 +139,13 @@ def test_context_flattened_and_state_writeback(mamba_env): ) seq_len = torch.tensor(lens, device=device, dtype=torch.int32) - seq_start = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) + cu_seqlen = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) use_initial_states = torch.zeros(batch, device=device, dtype=torch.bool) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + # For context/prefill phase: num_prefill = len(lens), num_decode = 0 + num_seqs = len(lens) + num_prefill_tokens = sum(lens) + batch_info = torch.tensor([num_seqs, num_prefill_tokens, 0], device=device, dtype=torch.int32) y = torch.ops.auto_deploy.torch_cached_ssm( # INPUTS hidden_states, @@ -146,9 +155,10 @@ def test_context_flattened_and_state_writeback(mamba_env): D, dt, dt_bias, - # METADATA + # STANDARD METADATA + batch_info, seq_len, - seq_start, + cu_seqlen, slot_idx, use_initial_states, # CACHES @@ -177,26 +187,3 @@ def test_context_flattened_and_state_writeback(mamba_env): assert torch.allclose(ssm_state_cache[slot_idx[i]].to(s_i.dtype), s_i, atol=atol, rtol=rtol) assert torch.allclose(y, y_ref.to(y.dtype), atol=atol, rtol=rtol) - - -def test_prepare_metadata(mamba_env): - device = mamba_env["device"] - - b, s = 4, 6 - # input_ids = torch.randint(0, 1000, (b, s), device=device) - position_ids = torch.arange(s, device=device).expand(b, -1) - seq_len = torch.tensor([2, 1, 0, 0], device=device, dtype=torch.int32) - input_pos = torch.tensor([0, 3, 0, 0], device=device, dtype=torch.int32) - cache_loc = torch.arange(b, device=device, dtype=torch.int32) - pages_per_seq = torch.ones(b, device=device, dtype=torch.int32) - slot_idx = torch.tensor([2, 0, 1, 3], device=device, dtype=torch.int32) - page_size = 128 - chunk_size = 128 - out = torch.ops.auto_deploy.torch_ssm_prepare_metadata( - position_ids, seq_len, input_pos, cache_loc, pages_per_seq, slot_idx, page_size, chunk_size - ) - # Returns a list of tensors 
from custom op API - assert len(out) == 4 - seq_len_s, seq_start, slot_s, use_initial_states = out - assert seq_len_s.numel() == 2 and slot_s.numel() == 2 - assert torch.all(seq_start == torch.tensor([0, 2], device=device, dtype=seq_start.dtype)) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py index 917cdbaca2..add5cd76be 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py @@ -121,7 +121,7 @@ def test_triton_context_flattened_and_state_writeback(mamba_env): ssm_state_cache_triton = ssm_state_cache_torch.clone() seq_len = torch.tensor(lens, device=device, dtype=torch.int32) - seq_start = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) + cu_seqlen = torch.tensor([0, lens[0]], device=device, dtype=torch.int32) use_initial_states = torch.tensor([0] * batch, device=device).to(torch.bool) cu_seqlens = torch.cat( [ @@ -134,7 +134,8 @@ def test_triton_context_flattened_and_state_writeback(mamba_env): torch.arange(len(lens), device=device, dtype=torch.int32), seq_len, ).view(1, -1) - batch_info_tensor = torch.tensor([len(lens), sum(lens), 0], dtype=torch.int32) + # batch_info: [num_prefill, num_prefill_tokens, num_decode] + batch_info_tensor = torch.tensor([len(lens), sum(lens), 0], dtype=torch.int32, device=device) # Torch reference y_torch = torch.ops.auto_deploy.torch_cached_ssm( hidden_states, @@ -144,11 +145,15 @@ def test_triton_context_flattened_and_state_writeback(mamba_env): D, dt, dt_bias, + # STANDARD METADATA + batch_info_tensor, seq_len, - seq_start, + cu_seqlen, slot_idx, use_initial_states, + # CACHES ssm_state_cache_torch, + # CONSTANTS time_step_limit, chunk_size, ) @@ -162,15 +167,18 @@ def test_triton_context_flattened_and_state_writeback(mamba_env): 
D, dt, dt_bias, - seq_len, + # STANDARD METADATA + batch_info_tensor, + cu_seqlens, slot_idx, use_initial_states, - cu_seqlens, + # EXTRA METADATA None, # chunk indices None, # chunk offsets seq_idx_prefill, - batch_info_tensor, + # CACHES ssm_state_cache_triton, + # CONSTANTS time_step_limit, chunk_size, ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py new file mode 100644 index 0000000000..de684fb6f8 --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py @@ -0,0 +1,162 @@ +"""Unit tests for triton utility custom ops.""" + +import pytest +import torch + +# Import to register the custom op +from tensorrt_llm._torch.auto_deploy.custom_ops import triton_utils # noqa: F401 + + +def _reference_gather_scatter( + ungathered_input: torch.Tensor, + gather_ids: torch.Tensor, + mask_indices: torch.Tensor, + out: torch.Tensor, +) -> torch.Tensor: + """Reference implementation using pure PyTorch.""" + out_ref = out.clone() + gathered_values = ungathered_input[gather_ids] + out_ref[mask_indices] = gathered_values + return out_ref + + +@pytest.mark.parametrize("n_elements", [1, 16, 128, 256, 1024, 4096]) +@pytest.mark.parametrize("dtype", [torch.int32, torch.int64, torch.float16, torch.float32]) +def test_fused_gather_scatter_basic(n_elements, dtype): + """Test basic gather-scatter functionality with various sizes and dtypes.""" + device = "cuda" + + # Create source tensor with unique values for easy verification + ungathered_input = torch.arange(n_elements * 2, device=device, dtype=dtype) + + # Create gather indices (gather from various positions in ungathered_input) + gather_ids = torch.randint(0, n_elements * 2, (n_elements,), device=device, dtype=torch.int32) + + # Create scatter indices (scatter to various positions in output) + mask_indices = 
torch.randperm(n_elements, device=device, dtype=torch.int32) + + # Create output tensors + out = torch.zeros(n_elements, device=device, dtype=dtype) + out_ref = out.clone() + + # Compute reference + out_ref = _reference_gather_scatter(ungathered_input, gather_ids, mask_indices, out_ref) + + # Call the custom op + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input, gather_ids, mask_indices, out + ) + + # Verify + torch.testing.assert_close(out, out_ref, rtol=0, atol=0) + + +@pytest.mark.parametrize("batch_size", [1, 8, 32, 64]) +def test_fused_gather_scatter_for_input_ids(batch_size): + """Test the typical use case: rescattering input_ids for overlap scheduler.""" + device = "cuda" + + # Simulate ungathered input_ids from a sampler + vocab_size = 32000 + ungathered_input_ids = torch.randint( + 0, vocab_size, (batch_size,), device=device, dtype=torch.int32 + ) + + # Gather indices specify which tokens to pick from ungathered_input_ids + gather_ids = torch.randperm(batch_size, device=device, dtype=torch.int32) + + # Mask indices specify where to place them in the output + mask_indices = torch.arange(batch_size, device=device, dtype=torch.int32) + + # Output buffer + input_ids_out = torch.zeros(batch_size, device=device, dtype=torch.int32) + + # Reference implementation + ref_out = _reference_gather_scatter( + ungathered_input_ids, gather_ids, mask_indices, input_ids_out.clone() + ) + + # Custom op + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input_ids, gather_ids, mask_indices, input_ids_out + ) + + torch.testing.assert_close(input_ids_out, ref_out, rtol=0, atol=0) + + +def test_fused_gather_scatter_identity(): + """Test identity gather-scatter (indices are identity permutation).""" + device = "cuda" + n_elements = 64 + + ungathered_input = torch.arange(n_elements, device=device, dtype=torch.int32) + gather_ids = torch.arange(n_elements, device=device, dtype=torch.int32) + mask_indices = torch.arange(n_elements, 
device=device, dtype=torch.int32) + + out = torch.zeros(n_elements, device=device, dtype=torch.int32) + + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input, gather_ids, mask_indices, out + ) + + # Should be identity + torch.testing.assert_close(out, ungathered_input, rtol=0, atol=0) + + +def test_fused_gather_scatter_reverse(): + """Test reverse gather-scatter.""" + device = "cuda" + n_elements = 64 + + ungathered_input = torch.arange(n_elements, device=device, dtype=torch.int32) + # Gather in order but scatter in reverse + gather_ids = torch.arange(n_elements, device=device, dtype=torch.int32) + mask_indices = torch.arange(n_elements - 1, -1, -1, device=device, dtype=torch.int32) + + out = torch.zeros(n_elements, device=device, dtype=torch.int32) + + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input, gather_ids, mask_indices, out + ) + + # Output should be reversed + expected = torch.arange(n_elements - 1, -1, -1, device=device, dtype=torch.int32) + torch.testing.assert_close(out, expected, rtol=0, atol=0) + + +def test_fused_gather_scatter_duplicate_gather(): + """Test that gathering same index multiple times works correctly.""" + device = "cuda" + n_elements = 16 + + ungathered_input = torch.arange(100, 100 + n_elements, device=device, dtype=torch.int32) + # Gather the same index (0) for all positions + gather_ids = torch.zeros(n_elements, device=device, dtype=torch.int32) + mask_indices = torch.arange(n_elements, device=device, dtype=torch.int32) + + out = torch.zeros(n_elements, device=device, dtype=torch.int32) + + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input, gather_ids, mask_indices, out + ) + + # All values should be the first element of ungathered_input (100) + expected = torch.full((n_elements,), 100, device=device, dtype=torch.int32) + torch.testing.assert_close(out, expected, rtol=0, atol=0) + + +def test_fused_gather_scatter_single_element(): + """Test with a single 
element.""" + device = "cuda" + + ungathered_input = torch.tensor([42], device=device, dtype=torch.int32) + gather_ids = torch.tensor([0], device=device, dtype=torch.int32) + mask_indices = torch.tensor([0], device=device, dtype=torch.int32) + + out = torch.zeros(1, device=device, dtype=torch.int32) + + torch.ops.auto_deploy.triton_utils_fused_gather_scatter( + ungathered_input, gather_ids, mask_indices, out + ) + + torch.testing.assert_close(out, ungathered_input, rtol=0, atol=0) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py index 320dbdcfa6..79457fbfca 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py @@ -47,7 +47,9 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs): "transforms": { "resize_kv_cache": {"free_mem_ratio": 0.0001}, "insert_cached_attention": {"backend": "flashinfer"}, - "compile_model": {"backend": "torch-opt"}, + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9878 + # "compile_model": {"backend": "torch-opt"}, + "compile_model": {"backend": "torch-cudagraph"}, }, }, ), @@ -191,6 +193,8 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs): { "transforms": { "multi_stream_moe": {"stage": "compile", "enabled": True}, + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9878 + "compile_model": {"backend": "torch-cudagraph"}, }, }, ), diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py index ee20b7950f..d67d790a47 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py +++ 
b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py @@ -3,11 +3,10 @@ from typing import List, Optional import pytest import torch import torch.nn as nn -from _graph_test_helpers import SequenceEmbeddingInfo from _model_test_utils import GQA from _torch_test_utils import all_close -from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig +from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig, SequenceInfo from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.models.factory import ( FullModelExportInfo, @@ -42,19 +41,25 @@ class DummyFactory(ModelFactory): # Class that uses SDPA directly instead of the regular attention mechanism -class GQAWithSdpa(GQA): - """GQA model that uses SDPA directly instead of the regular attention.""" +class GQAWithSdpaAndEmbedding(GQA): + """GQA model with embedding layer that uses SDPA directly instead of the regular attention.""" def __init__( self, - *args, - **kwargs, + num_attention_heads: int, + hidden_size: int, + num_key_value_heads: int, + vocab_size: int = 1000, ): - super().__init__(*args, **kwargs) + super().__init__(num_attention_heads, hidden_size, num_key_value_heads) # Store the head dimensions explicitly - self.num_heads = args[0] # First argument is num_attention_heads - self.num_kv_heads = args[2] # Third argument is num_key_value_heads - self.head_dim = args[1] // self.num_heads # hidden_size / num_heads + self.num_heads = num_attention_heads + self.num_kv_heads = num_key_value_heads + self.head_dim = hidden_size // num_attention_heads + self.vocab_size = vocab_size + + # Add embedding layer + self.embed_tokens = nn.Embedding(vocab_size, hidden_size) if self.num_heads != self.num_kv_heads: self.num_key_value_groups = self.num_heads // self.num_kv_heads @@ -69,12 +74,15 @@ class GQAWithSdpa(GQA): Forward pass with input tokens and optional position ids. 
position_ids parameter added to match expected interface in kvcache.py """ - b, s, _ = input_ids.shape + # Embed input_ids: [b, s] -> [b, s, hidden] + x = self.embed_tokens(input_ids) + + b, s, _ = x.shape # Project input to q, k, v representations - q = self.q_proj(input_ids) # [b, s, n*h_d] - k = self.k_proj(input_ids) # [b, s, n_kv*h_d] - v = self.v_proj(input_ids) # [b, s, n_kv*h_d] + q = self.q_proj(x) # [b, s, n*h_d] + k = self.k_proj(x) # [b, s, n_kv*h_d] + v = self.v_proj(x) # [b, s, n_kv*h_d] # Reshape to [b, s, n, h_d] q = q.view(b, s, self.num_heads, self.head_dim) @@ -141,29 +149,29 @@ def test_sdpa_with_kv_cache(dtype, attn_backend, gqa_config): num_reset_steps = 2 num_random_steps = 4 max_position_embeddings = 128 + vocab_size = 1000 - # set up sequence+cache objects - ci = SequenceEmbeddingInfo( + # set up sequence+cache objects using standard SequenceInfo + ci = SequenceInfo( max_seq_len=max_position_embeddings, max_batch_size=batch_size, - hidden_size=hidden_size, - dtype=dtype, ) cm = CachedSequenceInterface(sequence_info=ci, device="cuda") - # Create the model with SDPA and wrap it in a fake factory - model = GQAWithSdpa( + # Create the model with embedding layer and SDPA, wrap it in a fake factory + model = GQAWithSdpaAndEmbedding( num_attention_heads, hidden_size, num_key_value_heads, + vocab_size=vocab_size, ).to(dtype=dtype, device="cuda") - # Create input tensor and position_ids - x = torch.rand(batch_size, seq_len, hidden_size).to(device="cuda", dtype=dtype) + # Create input token ids and position_ids + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") position_ids = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to("cuda") # Get the model's regular output - y_model = model(x, position_ids) # b, s, d + y_model = model(input_ids, position_ids) # b, s, d # Apply the transformation optimizer = InferenceOptimizer( @@ -187,9 +195,6 @@ def test_sdpa_with_kv_cache(dtype, attn_backend, gqa_config): 
"cleanup_input_constraints": { "stage": "post_export", }, - "update_in_out_nodes": { - "stage": "cache_init", - }, "insert_cached_attention": { "stage": "cache_init", "backend": attn_backend, @@ -215,25 +220,29 @@ def test_sdpa_with_kv_cache(dtype, attn_backend, gqa_config): # Test 1: Regular inference (all tokens at once) cm.info.reset() - y_no_cache = _call_and_unnest(x, 0) + y_no_cache = _call_and_unnest(input_ids, 0) assert all_close(y_model, y_no_cache, atol=atol, rtol=rtol) # Test 2: Autoregressive inference with KV cache cm.info.reset() y_with_cache = torch.empty_like(y_model) - for i_p in range(x.shape[1]): + for i_p in range(input_ids.shape[1]): # Just pass the current token - y_with_cache[:, i_p : i_p + 1] = _call_and_unnest(x[:, i_p : i_p + 1], i_p) + y_with_cache[:, i_p : i_p + 1] = _call_and_unnest(input_ids[:, i_p : i_p + 1], i_p) assert all_close(y_model, y_with_cache, atol=atol, rtol=rtol) # Test 3: Cache continuation after random tokens - for i_p in range(x.shape[1] - num_reset_steps, x.shape[1] - num_reset_steps + num_random_steps): - _call_and_unnest(torch.rand_like(x[:, :1]), i_p) + for i_p in range( + input_ids.shape[1] - num_reset_steps, + input_ids.shape[1] - num_reset_steps + num_random_steps, + ): + random_tokens = torch.randint(0, vocab_size, (batch_size, 1), device="cuda") + _call_and_unnest(random_tokens, i_p) # Continue inference from previous context cm.info.reset() - for i_p in range(x.shape[1] - num_reset_steps, x.shape[1]): - y_with_cache[:, i_p : i_p + 1] = _call_and_unnest(x[:, i_p : i_p + 1], i_p) + for i_p in range(input_ids.shape[1] - num_reset_steps, input_ids.shape[1]): + y_with_cache[:, i_p : i_p + 1] = _call_and_unnest(input_ids[:, i_p : i_p + 1], i_p) assert all_close(y_model, y_with_cache, atol=atol, rtol=rtol) # Test 4: Exportability of the transformed model From d5b9ad91c92665eaa94ca94fcccf403ba5fbad3b Mon Sep 17 00:00:00 2001 From: zackyoray Date: Fri, 12 Dec 2025 14:21:10 +0200 Subject: [PATCH 100/172] [None][feat] 
Upgrade NIXL to v0.8.0 (#9707) Signed-off-by: Yoray Zack <62789610+zackyoray@users.noreply.github.com> Signed-off-by: zackyoray Signed-off-by: Bo Deng Co-authored-by: Bo Deng --- docker/common/install_nixl.sh | 7 +++++-- jenkins/current_image_tags.properties | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh index d13b0f1757..2aa3168c8b 100644 --- a/docker/common/install_nixl.sh +++ b/docker/common/install_nixl.sh @@ -4,7 +4,7 @@ set -ex GITHUB_URL="https://github.com" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" -NIXL_VERSION="0.7.1" +NIXL_VERSION="0.8.0" NIXL_REPO="https://github.com/ai-dynamo/nixl.git" OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH @@ -18,11 +18,14 @@ fi if [ -n "${GITHUB_MIRROR}" ]; then export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" fi -pip3 install --no-cache-dir meson ninja pybind11 +pip3 install --no-cache-dir meson ninja pybind11 setuptools git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO} cd nixl +# Remove POSIX backend compilation from meson.build +sed -i "/^subdir('posix')/d" src/plugins/meson.build + CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1) if [[ -z "$CUDA_SO_PATH" ]]; then diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index ed5f0078bd..0787c16eb8 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512091705-9823 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512091705-9823 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512121105-9707 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512121105-9707 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512121105-9707 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512121105-9707 From af315d8ef1ca09c835affa6dc627c22b44187b42 Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 06:29:05 -0800 Subject: [PATCH 101/172] [TRTLLM-5972][chore] Load balance decode token KV cache with helix parallelism (#9757) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- .../pyexecutor/executor_request_queue.py | 1 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 ++ .../_torch/pyexecutor/model_engine.py | 23 +++++++++---------- .../_torch/pyexecutor/resource_manager.py | 14 +++++++---- 
.../test_lists/qa/llm_function_core.txt | 1 + .../test_lists/test-db/l0_dgx_b200.yml | 1 + .../executor/test_pytorch_model_engine.py | 1 + 7 files changed, 26 insertions(+), 17 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py index 2cbf5635a0..120c42dbd2 100644 --- a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py +++ b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py @@ -694,6 +694,7 @@ class ExecutorRequestQueue: position_ids=position_ids_this_rank, ) req.total_input_len_cp = input_len + req.seqlen_this_rank_cp = len(input_ids_this_rank) req_with_children.append(req) if req.child_requests: req_with_children.extend(req.child_requests) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 2831438256..5f81b94a01 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -489,6 +489,8 @@ class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): self.py_max_new_tokens = self.max_new_tokens self.py_min_length = self.sampling_config.min_length self.py_helix_is_inactive_rank = False + self.seqlen_this_rank_cp = 0 + self.total_input_len_cp = 0 self.py_batch_idx = None self.py_draft_pages_allocated = 0 self.py_rewind_len = 0 diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 5dfbe7c9a2..6d49804e20 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -568,13 +568,12 @@ class PyTorchModelEngine(ModelEngine): # Reset the global cuda graph dummy request to None in warmup. 
self.cuda_graph_runner.padding_dummy_request = None - cp_type = self.mapping.cp_config.get('cp_type', None) - if cp_type is not None: - if cp_type in [CpType.ULYSSES, CpType.STAR]: - logger.info( - "[ModelEngine::warmup] Skipping warmup for cp_type: ", - cp_type.name) - return + if self.mapping.cp_size > 1: + cp_type = self.mapping.cp_config.get("cp_type", None) + logger.info( + f"[ModelEngine::warmup] Skipping warmup for cp_type: {None if cp_type is None else cp_type.name}." + ) + return self._run_torch_compile_warmup(resource_manager) self._run_autotuner_warmup(resource_manager) @@ -1671,12 +1670,12 @@ class PyTorchModelEngine(ModelEngine): # Warmup doesn't have `total_input_len_cp` set because merge_helix_requests is not called. if not self.is_warmup and not request.is_cuda_graph_dummy: position_id = request.total_input_len_cp + request.py_decoding_iter - 1 - # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix. - if self.mapping.cp_rank == self.mapping.cp_size - 1: - past_seen_token_num = request.orig_prompt_len + request.py_decoding_iter - 1 + if request.py_helix_is_inactive_rank: + past_seen_token_num = request.seqlen_this_rank_cp else: - # past_seen_token_num doesn't grow on inactive ranks. - past_seen_token_num = request.orig_prompt_len + # Discount the token added to active rank in resource manager as it hasn't + # been previously seen. 
+ past_seen_token_num = request.seqlen_this_rank_cp - 1 position_ids.append(position_id) num_cached_tokens_per_seq.append(past_seen_token_num) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index a70b35dfcf..bd1d197786 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -468,13 +468,17 @@ class KVCacheManager(BaseResourceManager): req, block_ids) for req in generation_batch: - # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix. if self.mapping.has_cp_helix(): - if self.mapping.cp_rank != self.mapping.cp_size - 1: + # Distribute the decode blocks across CP ranks in a round-robin manner. + decode_block_id = (req.py_decoding_iter - + 1) // self.tokens_per_block + if decode_block_id % self.mapping.cp_size == self.mapping.cp_rank: + req.py_helix_is_inactive_rank = False + req.seqlen_this_rank_cp += 1 + else: req.py_helix_is_inactive_rank = True - # Skip allocating KV cache at decode for inactive helix ranks. - if req.py_helix_is_inactive_rank: - continue + # Skip allocating KV cache at decode for inactive helix ranks. 
+ continue self.impl.add_token(req.py_request_id) for _ in range(get_draft_token_length(req)): self.impl.add_token(req.py_request_id) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 47a017378c..5b5ad88d3b 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -524,6 +524,7 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 89f6598da3..ccd23bdf08 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -189,3 +189,4 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix diff --git 
a/tests/unittest/_torch/executor/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py index ca75cbb351..76f7262930 100644 --- a/tests/unittest/_torch/executor/test_pytorch_model_engine.py +++ b/tests/unittest/_torch/executor/test_pytorch_model_engine.py @@ -407,6 +407,7 @@ class PyTorchModelEngineTestCase(unittest.TestCase): req.sampling_config.beam_width = 1 req.py_multimodal_data = {} req.total_input_len_cp = prompt_lens[idx] * 2 + req.seqlen_this_rank_cp = prompt_lens[idx] req.py_decoding_iter = 1 gen_requests.append(req) scheduled_requests.generation_requests = gen_requests From 9df4dad3b621d1a796e0baac4a6bb6b0c0739881 Mon Sep 17 00:00:00 2001 From: Yihan Wang Date: Fri, 12 Dec 2025 23:32:15 +0800 Subject: [PATCH 102/172] [None][fix] Introduce inline namespace to avoid symbol collision (#9541) Signed-off-by: Yihan Wang --- .gitattributes | 2 + .gitignore | 1 + benchmarks/cpp/utils/utils.cpp | 15 +- benchmarks/cpp/utils/utils.h | 9 +- cpp/include/tensorrt_llm/common/algorithm.h | 7 +- cpp/include/tensorrt_llm/common/arrayView.h | 10 +- cpp/include/tensorrt_llm/common/assert.h | 17 +- .../tensorrt_llm/common/bindingUtils.h | 10 +- cpp/include/tensorrt_llm/common/config.h | 62 + .../tensorrt_llm/common/cudaFp8Utils.h | 9 +- .../tensorrt_llm/common/cudaProfilerUtils.h | 12 +- cpp/include/tensorrt_llm/common/cudaUtils.h | 9 +- cpp/include/tensorrt_llm/common/dataType.h | 10 +- cpp/include/tensorrt_llm/common/logger.h | 13 +- cpp/include/tensorrt_llm/common/optionalRef.h | 10 +- .../tensorrt_llm/common/quantization.h | 9 +- cpp/include/tensorrt_llm/common/stringUtils.h | 9 +- .../tensorrt_llm/common/tllmException.h | 12 +- cpp/include/tensorrt_llm/common/utils.h | 10 +- .../tensorrt_llm/kernels/archCondition.h | 10 +- .../tensorrt_llm/kernels/decodingCommon.h | 9 +- .../tensorrt_llm/kernels/kvCacheIndex.h | 9 +- .../tensorrt_llm/kernels/kvCachePartialCopy.h | 8 +- cpp/kernels/fmha_v2/setup.py | 230 +- 
cpp/kernels/xqa/gen_cpp_header.py | 7 +- cpp/kernels/xqa/gen_cubins.py | 8 +- cpp/tensorrt_llm/common/assert.cpp | 2 +- cpp/tensorrt_llm/common/attentionOp.cpp | 1 + cpp/tensorrt_llm/common/attentionOp.h | 9 +- cpp/tensorrt_llm/common/cublasMMWrapper.cpp | 7 +- cpp/tensorrt_llm/common/cublasMMWrapper.h | 7 +- cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh | 8 +- cpp/tensorrt_llm/common/cudaBufferUtils.cuh | 8 +- cpp/tensorrt_llm/common/cudaDriverWrapper.cpp | 10 +- cpp/tensorrt_llm/common/cudaDriverWrapper.h | 8 +- cpp/tensorrt_llm/common/cudaFp8Utils.cu | 8 +- cpp/tensorrt_llm/common/cudaProfilerUtils.cpp | 9 +- cpp/tensorrt_llm/common/cudaTypeUtils.cuh | 8 +- .../common/customAllReduceUtils.h | 9 +- cpp/tensorrt_llm/common/envUtils.cpp | 9 +- cpp/tensorrt_llm/common/envUtils.h | 9 +- cpp/tensorrt_llm/common/lamportUtils.cuh | 8 +- cpp/tensorrt_llm/common/logger.cpp | 9 +- cpp/tensorrt_llm/common/mathUtils.h | 8 +- cpp/tensorrt_llm/common/mcastDevMemUtils.cpp | 12 +- cpp/tensorrt_llm/common/mcastDevMemUtils.h | 14 +- cpp/tensorrt_llm/common/memoryUtils.cu | 8 +- cpp/tensorrt_llm/common/memoryUtils.h | 8 +- cpp/tensorrt_llm/common/ncclUtils.h | 9 +- cpp/tensorrt_llm/common/nvtxUtils.h | 8 +- cpp/tensorrt_llm/common/opUtils.cpp | 3 + cpp/tensorrt_llm/common/opUtils.h | 9 +- cpp/tensorrt_llm/common/quantTypeUtils.cuh | 8 +- cpp/tensorrt_llm/common/reduceKernelUtils.cuh | 8 +- cpp/tensorrt_llm/common/safetensors.cpp | 9 +- cpp/tensorrt_llm/common/safetensors.h | 9 +- cpp/tensorrt_llm/common/stlUtils.h | 9 +- cpp/tensorrt_llm/common/stringUtils.cpp | 9 +- cpp/tensorrt_llm/common/timestampUtils.cpp | 9 +- cpp/tensorrt_llm/common/timestampUtils.h | 9 +- cpp/tensorrt_llm/common/tllmException.cpp | 9 +- cpp/tensorrt_llm/common/workspace.h | 9 +- .../cutlass_extensions/compute_occupancy.h | 8 +- .../cutlass_extensions/epilogue_helpers.h | 8 +- .../include/cutlass_extensions/gemm_configs.h | 8 +- cpp/tensorrt_llm/executor/executorImpl.cpp | 25 +- 
.../kernels/IndexerKCacheScatter.h | 7 +- cpp/tensorrt_llm/kernels/IndexerTopK.h | 9 +- cpp/tensorrt_llm/kernels/attentionMask.cu | 8 +- cpp/tensorrt_llm/kernels/attentionMask.h | 8 +- cpp/tensorrt_llm/kernels/banBadWords.cu | 8 +- cpp/tensorrt_llm/kernels/banBadWords.h | 8 +- cpp/tensorrt_llm/kernels/banRepeatNgram.cu | 7 +- cpp/tensorrt_llm/kernels/banRepeatNgram.h | 8 +- cpp/tensorrt_llm/kernels/beamSearchKernels.cu | 8 +- cpp/tensorrt_llm/kernels/beamSearchKernels.h | 14 +- .../beamSearchKernels1024.cu | 8 +- .../beamSearchKernels/beamSearchKernels128.cu | 8 +- .../beamSearchKernels/beamSearchKernels16.cu | 8 +- .../beamSearchKernels/beamSearchKernels256.cu | 8 +- .../beamSearchKernels/beamSearchKernels32.cu | 8 +- .../beamSearchKernels/beamSearchKernels4.cu | 8 +- .../beamSearchKernels/beamSearchKernels512.cu | 8 +- .../beamSearchKernels/beamSearchKernels64.cu | 8 +- .../beamSearchKernels/beamSearchKernels8.cu | 8 +- .../beamSearchKernelsTemplate.h | 9 +- .../buildRelativeAttentionBiasKernel.cu | 9 +- .../buildRelativeAttentionBiasKernel.h | 8 +- .../kernels/causalConv1d/causalConv1d.cu | 9 +- .../kernels/causalConv1d/causalConv1d.h | 9 +- .../allReduceFusionKernels.cu | 9 +- .../allReduceFusionKernels.h | 11 +- .../allReduceWorkspace.cu | 9 +- .../communicationKernels/allReduceWorkspace.h | 9 +- .../customLowPrecisionAllReduceKernels.cu | 9 +- .../customLowPrecisionAllReduceKernels.h | 9 +- .../mnnvlAllreduceKernels.cu | 9 +- .../mnnvlAllreduceKernels.h | 7 +- .../moeAllReduceFusionKernels.cu | 9 +- .../moeAllReduceFusionKernels.h | 11 +- .../moeAlltoAllKernels.cu | 9 +- .../communicationKernels/moeAlltoAllKernels.h | 9 +- .../fmhaPackedMask.cu | 8 +- .../fmhaPackedMask.h | 8 +- .../fmhaRunner.cpp | 8 +- .../fmhaRunner.h | 8 +- .../fused_multihead_attention_common.h | 11 +- .../fused_multihead_attention_v2.cpp | 14 +- .../fused_multihead_attention_v2.h | 9 +- .../tmaDescriptor.h | 13 +- cpp/tensorrt_llm/kernels/cumsumLastDim.cu | 8 +- 
cpp/tensorrt_llm/kernels/cumsumLastDim.h | 8 +- .../kernels/customAllReduceKernels.cu | 9 +- .../kernels/customAllReduceKernels.h | 11 +- .../kernels/customMoeRoutingKernels.cu | 9 +- .../kernels/customMoeRoutingKernels.h | 9 +- .../kernels/cuteDslKernels/moeUtils.cu | 9 +- .../kernels/cuteDslKernels/moeUtils.h | 9 +- .../allreduce_gemm_impl_sm100.h | 9 +- .../allreduce_gemm/allreduce_gemm_impl_sm90.h | 9 +- .../allreduce_gemm/allreduce_gemm_runner.cu | 10 +- .../cutlass_kernels/cutlass_heuristic.cpp | 8 +- .../cutlass_kernels/cutlass_heuristic.h | 8 +- .../cutlass_kernels/cutlass_preprocessors.cpp | 8 +- .../cutlass_kernels/cutlass_preprocessors.h | 8 +- .../cutlass_kernels/cutlass_type_conversion.h | 8 +- .../cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu | 8 +- .../cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu | 8 +- .../cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu | 8 +- .../fp4_gemm/fp4_gemm_template.h | 9 +- .../mxfp8_mxfp4_gemm_template_sm100.h | 8 +- .../nvfp4_nvfp4_gemm_template_sm100.h | 11 +- .../nvfp4_nvfp4_gemm_template_sm120.h | 11 +- .../fp8_blockscale_gemm.cu | 9 +- .../fp8_blockscale_gemm/fp8_blockscale_gemm.h | 11 +- .../fp8_blockscale_gemm_kernel.cuh | 9 +- .../fp8_blockscale_mma_utils.cuh | 11 +- .../fp8_blockscale_tma_utils.cuh | 11 +- .../fp8_rowwise_gemm/fp8_rowwise_gemm.h | 8 +- .../fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu | 8 +- .../fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu | 8 +- .../fp8_rowwise_gemm_kernel_template_sm100.h | 9 +- .../fp8_rowwise_gemm_kernel_template_sm89.h | 8 +- .../fp8_rowwise_gemm_kernel_template_sm90.h | 8 +- .../fp8_rowwise_gemm_template.h | 8 +- .../bf16_int4_gemm_fg_scalebias.cu | 8 +- .../bf16_int4_gemm_fg_scaleonly.cu | 8 +- .../fpA_intB_gemm/bf16_int4_gemm_per_col.cu | 8 +- .../bf16_int8_gemm_fg_scalebias.cu | 8 +- .../bf16_int8_gemm_fg_scaleonly.cu | 8 +- .../fpA_intB_gemm/bf16_int8_gemm_per_col.cu | 8 +- ...m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu | 8 +- ...e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu | 8 +- 
...m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu | 8 +- ...e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu | 8 +- .../e4m3_int4_gemm_per_col_f16_out_f16.cu | 8 +- .../fp16_int4_gemm_fg_scalebias.cu | 8 +- .../fp16_int4_gemm_fg_scaleonly.cu | 8 +- .../fpA_intB_gemm/fp16_int4_gemm_per_col.cu | 8 +- .../fp16_int8_gemm_fg_scalebias.cu | 8 +- .../fp16_int8_gemm_fg_scaleonly.cu | 8 +- .../fpA_intB_gemm/fp16_int8_gemm_per_col.cu | 8 +- .../fpA_intB_gemm/fpA_intB_gemm.h | 8 +- .../fpA_intB_gemm/fpA_intB_gemm_template.h | 8 +- .../fpA_intB_gemm_template_sm90.h | 8 +- .../launchers/fpA_intB_launcher_sm90.h | 8 +- .../launchers/fpA_intB_launcher_sm90.inl | 8 +- .../fused_gated_gemm/fused_gated_gemm.h | 8 +- .../fused_gated_gemm_kernel_template_sm90.h | 8 +- .../fused_gated_gemm_template.h | 8 +- .../fused_gated_gemm/gemm_swiglu_e4m3.cu | 8 +- .../include/allreduce_gemm_runner.h | 9 +- .../kernels/cutlass_kernels/include/common.h | 10 +- .../cutlass_kernels/include/fp4_gemm.h | 8 +- .../include/low_latency_gemm.h | 9 +- .../include/moe_gemm_kernels.h | 9 +- .../cutlass_kernels/include/moe_kernels.h | 9 +- .../include/moe_util_kernels.h | 9 +- .../cutlass_kernels/int8_gemm/int8_gemm.h | 9 +- .../int8_gemm/int8_gemm_bf16.cu | 8 +- .../int8_gemm/int8_gemm_fp16.cu | 8 +- .../int8_gemm/int8_gemm_fp32.cu | 8 +- .../int8_gemm/int8_gemm_int32.cu | 8 +- .../int8_gemm/int8_gemm_template.h | 8 +- .../fp8_low_latency_gemm_template.h | 6 +- .../low_latency_fp8_gemm_bf16.cu | 8 +- .../low_latency_fp8_gemm_fp16.cu | 8 +- .../low_latency_fp8_gemm_fp32.cu | 8 +- .../launchers/fused_moe_gemm_launcher_sm80.h | 10 +- .../fused_moe_gemm_launcher_sm80.inl | 9 +- .../launchers/moe_gemm_tma_ws_launcher.h | 9 +- .../launchers/moe_gemm_tma_ws_launcher.inl | 8 +- .../moe_gemm_tma_ws_mixed_input_launcher.h | 8 +- .../moe_gemm_tma_ws_mixed_input_launcher.inl | 8 +- .../moe_gemm/moe_gemm_kernels_bf16_bf16.cu | 9 +- .../moe_gemm/moe_gemm_kernels_bf16_fp4.cu | 9 +- .../moe_gemm/moe_gemm_kernels_bf16_fp8.cu | 9 +- 
.../moe_gemm/moe_gemm_kernels_bf16_uint4.cu | 9 +- .../moe_gemm/moe_gemm_kernels_bf16_uint8.cu | 9 +- .../moe_gemm/moe_gemm_kernels_fp16_fp16.cu | 7 +- .../moe_gemm/moe_gemm_kernels_fp16_fp4.cu | 7 +- .../moe_gemm/moe_gemm_kernels_fp16_uint4.cu | 7 +- .../moe_gemm/moe_gemm_kernels_fp16_uint8.cu | 7 +- .../moe_gemm/moe_gemm_kernels_fp32_fp32.cu | 7 +- .../moe_gemm/moe_gemm_kernels_fp4_fp4.cu | 9 +- .../moe_gemm/moe_gemm_kernels_fp8_fp4.cu | 9 +- .../moe_gemm/moe_gemm_kernels_fp8_fp8.cu | 9 +- .../moe_gemm/moe_gemm_kernels_fp8_uint4.cu | 9 +- .../moe_gemm/moe_gemm_template_dispatch.h | 13 +- .../moe_gemm_template_dispatch_tma_ws.h | 11 +- ...emm_template_dispatch_tma_ws_mixed_dtype.h | 9 +- .../moe_gemm_tma_warp_specialized_input.cu | 9 +- .../cutlass_kernels/moe_gemm/moe_kernels.cu | 9 +- .../cutlass_kernels/moe_gemm/moe_kernels.cuh | 9 +- .../moe_tma_warp_specialized_traits.h | 9 +- .../python/generate_kernels.py | 6 +- .../decoderMaskedMultiheadAttention.cu | 8 +- .../kernels/decoderMaskedMultiheadAttention.h | 8 +- .../cubin/xqa_kernel_cubin.cpp | 3 + .../cubin/xqa_kernel_cubin.h | 1896 +----- .../decoderMaskedMultiheadAttentionLaunch.h | 8 +- .../decoderMaskedMultiheadAttentionTemplate.h | 8 +- .../decoderXQAConstants.h | 8 +- .../decoderXQAImpl.cpp | 8 +- .../decoderXQAImpl.h | 8 +- .../decoderXQAImplCommon.cpp | 9 +- .../decoderXQAImplCommon.h | 8 +- .../decoderXQAImplJIT/compileEngine.cpp | 8 +- .../decoderXQAImplJIT/compileEngine.h | 8 +- .../decoderXQAImplJIT/cubinObj.cpp | 9 +- .../decoderXQAImplJIT/cubinObj.h | 8 +- .../decoderXQAImplJIT/cubinObjRegistry.h | 9 +- .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 9 +- .../decoderXQAImplJIT/decoderXQAImplJIT.h | 8 +- .../decoderXQAImplJIT/kernelUtils.cpp | 8 +- .../decoderXQAImplJIT/kernelUtils.h | 8 +- .../decoderXQAImplJIT/serializationUtils.h | 10 +- .../decoderXQAImplPrecompiled.cpp | 11 +- .../decoderXQAImplPrecompiled.h | 8 +- .../decoderXQARunner.cpp | 7 +- .../decoderXQARunner.h | 8 +- 
...decoderMaskedMultiheadAttention104_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention104_float.cu | 8 +- ...decoderMaskedMultiheadAttention104_half.cu | 8 +- ...decoderMaskedMultiheadAttention112_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention112_float.cu | 8 +- ...decoderMaskedMultiheadAttention112_half.cu | 8 +- ...decoderMaskedMultiheadAttention128_bf16.cu | 8 +- ...headAttention128_bf16_block_sparse_attn.cu | 8 +- ...ttention128_bf16_implicit_relative_attn.cu | 8 +- ...ultiheadAttention128_bf16_qk_tanh_scale.cu | 8 +- ...ecoderMaskedMultiheadAttention128_float.cu | 8 +- ...eadAttention128_float_block_sparse_attn.cu | 8 +- ...tention128_float_implicit_relative_attn.cu | 8 +- ...ltiheadAttention128_float_qk_tanh_scale.cu | 8 +- ...decoderMaskedMultiheadAttention128_half.cu | 8 +- ...headAttention128_half_block_sparse_attn.cu | 8 +- ...ttention128_half_implicit_relative_attn.cu | 8 +- ...ultiheadAttention128_half_qk_tanh_scale.cu | 8 +- ...decoderMaskedMultiheadAttention144_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention144_float.cu | 8 +- ...decoderMaskedMultiheadAttention144_half.cu | 8 +- ...decoderMaskedMultiheadAttention160_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention160_float.cu | 8 +- ...decoderMaskedMultiheadAttention160_half.cu | 8 +- ...decoderMaskedMultiheadAttention192_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention192_float.cu | 8 +- ...decoderMaskedMultiheadAttention192_half.cu | 8 +- ...decoderMaskedMultiheadAttention224_bf16.cu | 8 +- ...ecoderMaskedMultiheadAttention224_float.cu | 8 +- ...decoderMaskedMultiheadAttention224_half.cu | 8 +- ...decoderMaskedMultiheadAttention256_bf16.cu | 8 +- ...ultiheadAttention256_bf16_qk_tanh_scale.cu | 8 +- ...ecoderMaskedMultiheadAttention256_float.cu | 8 +- ...ltiheadAttention256_float_qk_tanh_scale.cu | 8 +- ...decoderMaskedMultiheadAttention256_half.cu | 8 +- ...ultiheadAttention256_half_qk_tanh_scale.cu | 8 +- .../decoderMaskedMultiheadAttention32_bf16.cu | 8 +- 
...Attention32_bf16_implicit_relative_attn.cu | 8 +- ...decoderMaskedMultiheadAttention32_float.cu | 8 +- ...ttention32_float_implicit_relative_attn.cu | 8 +- .../decoderMaskedMultiheadAttention32_half.cu | 8 +- ...Attention32_half_implicit_relative_attn.cu | 8 +- .../decoderMaskedMultiheadAttention48_bf16.cu | 8 +- ...decoderMaskedMultiheadAttention48_float.cu | 8 +- .../decoderMaskedMultiheadAttention48_half.cu | 8 +- .../decoderMaskedMultiheadAttention64_bf16.cu | 8 +- ...Attention64_bf16_implicit_relative_attn.cu | 8 +- ...decoderMaskedMultiheadAttention64_float.cu | 8 +- ...ttention64_float_implicit_relative_attn.cu | 8 +- .../decoderMaskedMultiheadAttention64_half.cu | 8 +- ...Attention64_half_implicit_relative_attn.cu | 8 +- .../decoderMaskedMultiheadAttention80_bf16.cu | 8 +- ...decoderMaskedMultiheadAttention80_float.cu | 8 +- .../decoderMaskedMultiheadAttention80_half.cu | 8 +- .../decoderMaskedMultiheadAttention96_bf16.cu | 8 +- ...decoderMaskedMultiheadAttention96_float.cu | 8 +- .../decoderMaskedMultiheadAttention96_half.cu | 8 +- .../tensorMapUtils.cpp | 9 +- .../tensorMapUtils.h | 8 +- .../xqaParams.h | 8 +- .../decoderMaskedMultiheadAttentionUtils.h | 8 +- cpp/tensorrt_llm/kernels/decodingCommon.cu | 13 +- cpp/tensorrt_llm/kernels/decodingKernels.cu | 12 +- cpp/tensorrt_llm/kernels/decodingKernels.h | 12 +- cpp/tensorrt_llm/kernels/delayStream.cu | 9 +- cpp/tensorrt_llm/kernels/delayStream.h | 9 +- cpp/tensorrt_llm/kernels/doraScaling.cu | 9 +- cpp/tensorrt_llm/kernels/doraScaling.h | 8 +- .../dsv3MinLatencyKernels/dsv3FusedAGemm.cu | 9 +- .../dsv3MinLatencyKernels/dsv3FusedAGemm.h | 9 +- .../dsv3MinLatencyKernels/dsv3RouterGemm.cu | 10 +- .../dsv3MinLatencyKernels/dsv3RouterGemm.h | 9 +- cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp | 9 +- cpp/tensorrt_llm/kernels/fmhaDispatcher.h | 9 +- .../fusedLayernormKernels/fp4_converter.cuh | 9 +- .../fusedLayernormKernels/layernorm_param.h | 9 +- .../low_latency_layernorm.cuh | 9 +- 
.../fusedLayernormKernels/ws_layernorm.cuh | 9 +- .../fusedLayernormKernels/ws_layernorm.h | 9 +- .../ws_layernorm_fp4_traits.cu | 9 +- .../kernels/fusedMoeCommKernels.cu | 12 +- .../kernels/fusedMoeCommKernels.h | 8 +- .../kernels/fusedQKNormRopeKernel.cu | 16 +- .../kernels/fusedQKNormRopeKernel.h | 8 +- cpp/tensorrt_llm/kernels/gptKernels.cu | 8 +- cpp/tensorrt_llm/kernels/gptKernels.h | 8 +- cpp/tensorrt_llm/kernels/groupGemm.cu | 9 +- cpp/tensorrt_llm/kernels/groupGemm.h | 7 +- .../groupRmsNormKernels.cu | 9 +- .../groupRmsNormKernels/groupRmsNormKernels.h | 11 +- cpp/tensorrt_llm/kernels/helixKernels.cu | 8 +- cpp/tensorrt_llm/kernels/helixKernels.h | 8 +- .../kernels/indexerKCacheScatter.cu | 9 +- cpp/tensorrt_llm/kernels/indexerTopK.cu | 9 +- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../include/allreduce_gemm_runner.h | 37 +- .../include/fp4_gemm.h | 8 +- .../include/low_latency_gemm.h | 9 +- .../include/low_latency_gemm_swiglu.h | 8 +- .../include/moe_gemm_kernels.h | 12 +- .../include/moe_kernels.h | 134 +- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- .../x86_64-linux-gnu/version.txt | 4 +- .../kernels/kvCachePartialCopy.cu | 8 +- cpp/tensorrt_llm/kernels/kvCacheUtils.h | 11 +- cpp/tensorrt_llm/kernels/layernormKernels.cu | 8 +- cpp/tensorrt_llm/kernels/layernormKernels.h | 8 +- .../llama4Bf16Bf16Gemm.cu | 9 +- .../llama4Bf16Bf16Gemm.h | 9 +- .../llama4Fp8Bf16Gemm.cu | 9 +- .../llama4Fp8Bf16Gemm.h | 9 +- ...Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh | 9 +- .../llama4Fp8Bf16GemmPerBlockTemplate.cuh | 9 +- .../llama4Fp8Bf16GemmPerWarpTemplate.cuh | 9 +- .../llama4Fp8Fp8GemmSwiGLU.cu | 9 +- .../llama4Fp8Fp8GemmSwiGLU.h | 9 +- ...llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh | 9 +- .../llama4MinLatencyMoEOp.cu | 9 +- .../llama4MinLatencyMoEOp.h | 9 +- .../llama4MinLatencyKernels/llama4Utils.cuh | 9 +- cpp/tensorrt_llm/kernels/logitsBitmask.cu | 8 +- cpp/tensorrt_llm/kernels/logitsBitmask.h | 8 +- 
cpp/tensorrt_llm/kernels/lookupKernels.cu | 8 +- cpp/tensorrt_llm/kernels/lookupKernels.h | 8 +- cpp/tensorrt_llm/kernels/lora/dora.h | 9 +- cpp/tensorrt_llm/kernels/lora/lora.cpp | 13 +- cpp/tensorrt_llm/kernels/lora/lora.h | 9 +- cpp/tensorrt_llm/kernels/lruKernel.cu | 8 +- cpp/tensorrt_llm/kernels/lruKernel.h | 8 +- .../kernels/mambaConv1dKernels.cu | 8 +- cpp/tensorrt_llm/kernels/mambaConv1dKernels.h | 8 +- cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu | 8 +- .../kernels/mlaChunkedPrefill.cuh | 8 +- cpp/tensorrt_llm/kernels/mlaKernels.cu | 7 +- cpp/tensorrt_llm/kernels/mlaKernels.h | 8 +- cpp/tensorrt_llm/kernels/moeAlignKernels.cu | 10 +- cpp/tensorrt_llm/kernels/moeAlignKernels.h | 9 +- .../kernels/moeCommKernelsCommon.h | 8 +- .../moeLoadBalance/moeLoadBalanceCommon.h | 8 +- .../moeLoadBalance/moeLoadBalanceKernels.cu | 8 +- .../moeLoadBalance/moeLoadBalanceKernels.h | 8 +- cpp/tensorrt_llm/kernels/moePrepareKernels.cu | 9 +- cpp/tensorrt_llm/kernels/moePrepareKernels.h | 9 +- cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh | 9 +- cpp/tensorrt_llm/kernels/moe_utils.cuh | 8 +- .../kernels/multiHeadAttentionCommon.h | 8 +- cpp/tensorrt_llm/kernels/noAuxTcKernels.cu | 9 +- cpp/tensorrt_llm/kernels/noAuxTcKernels.h | 9 +- cpp/tensorrt_llm/kernels/penaltyKernels.cu | 13 +- cpp/tensorrt_llm/kernels/penaltyKernels.h | 9 +- cpp/tensorrt_llm/kernels/penaltyTypes.h | 8 +- .../kernels/preQuantScaleKernel.cu | 8 +- .../kernels/preQuantScaleKernel.h | 8 +- cpp/tensorrt_llm/kernels/qserveGemm.h | 8 +- .../kernels/qserveGemmPerChannel.cu | 8 +- .../kernels/qserveGemmPerGroup.cu | 7 +- cpp/tensorrt_llm/kernels/quantization.cu | 8 +- cpp/tensorrt_llm/kernels/quantization.cuh | 8 +- cpp/tensorrt_llm/kernels/quantization.h | 8 +- .../kernels/recoverFromRingAtten.cu | 8 +- .../kernels/recoverFromRingAtten.h | 8 +- cpp/tensorrt_llm/kernels/rmsnormKernels.cu | 8 +- cpp/tensorrt_llm/kernels/rmsnormKernels.h | 8 +- .../kernels/sageAttentionKernels.cu | 8 +- 
.../kernels/sageAttentionKernels.h | 8 +- .../kernels/samplingAirTopPKernels.cu | 9 +- .../kernels/samplingTopKKernels.cu | 10 +- .../kernels/samplingTopKKernels.h | 9 +- .../kernels/samplingTopPKernels.cu | 10 +- .../kernels/samplingTopPKernels.h | 9 +- .../kernels/selectiveScan/bmmchunk.h | 7 +- .../kernels/selectiveScan/chunkcumsum.h | 7 +- .../kernels/selectiveScan/chunkscan.h | 7 +- .../kernels/selectiveScan/chunkstate.h | 7 +- .../instantiation/bmmchunk_bf16.cu | 7 +- .../instantiation/bmmchunk_fp16.cu | 7 +- .../instantiation/chunkcumsum_bf16_bf16.cu | 7 +- .../instantiation/chunkcumsum_bf16_fp32.cu | 7 +- .../instantiation/chunkcumsum_fp16_fp16.cu | 7 +- .../instantiation/chunkcumsum_fp16_fp32.cu | 7 +- .../instantiation/chunkscan_bf16_bf16.cu | 7 +- .../instantiation/chunkscan_bf16_fp32.cu | 7 +- .../instantiation/chunkscan_fp16_fp16.cu | 7 +- .../instantiation/chunkscan_fp16_fp32.cu | 7 +- .../instantiation/chunkstate_bf16.cu | 7 +- .../instantiation/chunkstate_fp16.cu | 7 +- .../instantiation/statepassing_bf16.cu | 7 +- .../instantiation/statepassing_fp16.cu | 7 +- .../kernels/selectiveScan/selectiveScan.cu | 8 +- .../kernels/selectiveScan/selectiveScan.h | 8 +- .../kernels/selectiveScan/statepassing.h | 7 +- .../kernels/sparseAttentionKernels.cu | 8 +- .../kernels/sparseAttentionKernels.h | 8 +- .../kernels/speculativeDecoding/common.cu | 9 +- .../kernels/speculativeDecoding/common.h | 9 +- .../draftTokenTreeKernels.cu | 8 +- .../draftTokenTreeKernels.h | 8 +- .../eagleDecodingKernels.cu | 10 +- .../eagleDecodingKernels.h | 9 +- .../explicitDraftTokensKernels.cu | 10 +- .../explicitDraftTokensKernels.h | 9 +- .../externalDraftTokensKernels.cu | 10 +- .../externalDraftTokensKernels.h | 9 +- .../kvCacheUpdateKernels.cu | 9 +- .../kvCacheUpdateKernels.h | 9 +- .../medusaDecodingKernels.cu | 10 +- .../medusaDecodingKernels.h | 9 +- .../kernels/speculativeDecoding/mtpKernels.cu | 8 +- .../kernels/speculativeDecoding/mtpKernels.h | 9 +- 
cpp/tensorrt_llm/kernels/splitkGroupGemm.cu | 15 +- cpp/tensorrt_llm/kernels/splitkGroupGemm.h | 9 +- .../kernels/stopCriteriaKernels.cu | 8 +- .../kernels/stopCriteriaKernels.h | 8 +- cpp/tensorrt_llm/kernels/topkLastDim.cu | 8 +- cpp/tensorrt_llm/kernels/topkLastDim.h | 8 +- .../batchedGemm/KernelRunner.cpp | 10 +- .../batchedGemm/KernelRunner.h | 8 +- .../BatchedGemmInterface.h | 1 + .../trtllmGenKernels/blockScaleMoe/runner.cu | 8 +- .../trtllmGenKernels/blockScaleMoe/runner.h | 8 +- .../fmha/cubin/kernelMetaInfo.h | 5863 +---------------- .../fmha/cubin/kernelMetaInfo_cubin.cpp | 3 + .../trtllmGenKernels/fmha/fmhaKernels.h | 12 +- .../trtllmGenKernels/fmha/fmhaReduction.cu | 8 +- .../trtllmGenKernels/fmha/fmhaReduction.h | 8 +- .../trtllmGenKernels/fmha/fmhaRunner.cpp | 8 +- .../trtllmGenKernels/fmha/fmhaRunner.h | 8 +- .../trtllmGenKernels/fmha/fmhaRunnerParams.h | 8 +- .../trtllmGenKernels/fmha/kernelParams.h | 8 +- .../trtllmGenKernels/fmha/kernelUtils.h | 8 +- .../fmha/prepareCustomMask.cu | 8 +- .../trtllmGenKernels/fmha/prepareCustomMask.h | 8 +- .../trtllmGenKernels/gemm/KernelRunner.cpp | 8 +- .../trtllmGenKernels/gemm/KernelRunner.h | 8 +- .../gemmGatedAct/KernelRunner.cpp | 8 +- .../gemmGatedAct/KernelRunner.h | 8 +- .../kernels/unfusedAttentionKernels.cu | 8 +- .../kernels/unfusedAttentionKernels.h | 7 +- .../unfusedAttentionKernels_2_bf16_bf16.cu | 8 +- .../unfusedAttentionKernels_2_bf16_fp4.cu | 8 +- .../unfusedAttentionKernels_2_bf16_fp8.cu | 8 +- .../unfusedAttentionKernels_2_bf16_int8.cu | 8 +- .../unfusedAttentionKernels_2_float_float.cu | 8 +- .../unfusedAttentionKernels_2_float_fp8.cu | 8 +- .../unfusedAttentionKernels_2_float_int8.cu | 8 +- .../unfusedAttentionKernels_2_half_fp4.cu | 8 +- .../unfusedAttentionKernels_2_half_fp8.cu | 8 +- .../unfusedAttentionKernels_2_half_half.cu | 8 +- .../unfusedAttentionKernels_2_half_int8.cu | 8 +- .../unfusedAttentionKernels_2_template.h | 8 +- .../kernels/userbuffers/ipcsocket.cpp | 3 + 
.../kernels/userbuffers/ub_allocator.cpp | 2 +- .../kernels/userbuffers/ub_allocator.h | 5 +- .../kernels/userbuffers/ub_interface.cpp | 29 +- .../kernels/userbuffers/ub_interface.h | 15 +- .../kernels/userbuffers/userbuffers-host.cpp | 2 +- .../kernels/userbuffers/userbuffers.cu | 10 +- .../kernels/userbuffers/userbuffers.h | 12 +- .../kernels/weightOnlyBatchedGemv/common.h | 8 +- .../kernels/weightOnlyBatchedGemv/converter.h | 8 +- .../weightOnlyBatchedGemv/cudaCoreGemm.cu | 8 +- .../weightOnlyBatchedGemv/cudaCoreGemm.h | 8 +- .../cudaCoreGemmNVFP4.cu | 8 +- .../weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h | 8 +- .../kernels/weightOnlyBatchedGemv/details.h | 8 +- .../kernels/weightOnlyBatchedGemv/int8SQ.cu | 8 +- .../kernels/weightOnlyBatchedGemv/int8SQ.h | 8 +- .../kernels/weightOnlyBatchedGemv/kernel.h | 8 +- .../weightOnlyBatchedGemv/kernelDispatcher.h | 8 +- ...wiseColumnMajorInterleavedForHopperTrue.cu | 8 +- ...Int4GroupwiseColumnMajorInterleavedTrue.cu | 8 +- ...nnelColumnMajorInterleavedForHopperTrue.cu | 8 +- ...nt4PerChannelColumnMajorInterleavedTrue.cu | 8 +- ...pwiseColumnMajoInterleavedForHopperTrue.cu | 8 +- ...Int8GroupwiseColumnMajorInterleavedTrue.cu | 8 +- ...nnelColumnMajorInterleavedForHopperTrue.cu | 8 +- ...nt8PerChannelColumnMajorInterleavedTrue.cu | 8 +- ...wiseColumnMajorInterleavedForHopperTrue.cu | 8 +- ...Int4GroupwiseColumnMajorInterleavedTrue.cu | 8 +- ...nnelColumnMajorInterleavedForHopperTrue.cu | 8 +- ...nt4PerChannelColumnMajorInterleavedTrue.cu | 8 +- ...wiseColumnMajorInterleavedForHopperTrue.cu | 8 +- ...Int8GroupwiseColumnMajorInterleavedTrue.cu | 8 +- ...nnelColumnMajorInterleavedForHopperTrue.cu | 8 +- ...nt8PerChannelColumnMajorInterleavedTrue.cu | 8 +- .../weightOnlyBatchedGemv/kernelLauncher.h | 8 +- .../kernels/weightOnlyBatchedGemv/utility.h | 8 +- cpp/tensorrt_llm/kernels/xqaDispatcher.cpp | 9 +- cpp/tensorrt_llm/kernels/xqaDispatcher.h | 9 +- .../nanobind/userbuffers/bindings.cpp | 9 +- 
.../nanobind/userbuffers/bindings.h | 10 +- .../pybind/userbuffers/bindings.cpp | 9 +- .../pybind/userbuffers/bindings.h | 9 +- .../runtime/gptDecoderBatched.cpp | 2 +- .../thop/IndexerKCacheScatterOp.cpp | 6 +- cpp/tensorrt_llm/thop/IndexerTopKOp.cpp | 9 +- cpp/tensorrt_llm/thop/allgatherOp.cpp | 12 +- cpp/tensorrt_llm/thop/allreduceOp.cpp | 14 +- cpp/tensorrt_llm/thop/alltoallOp.cpp | 6 +- cpp/tensorrt_llm/thop/attentionOp.cpp | 6 +- cpp/tensorrt_llm/thop/attentionOp.h | 6 + cpp/tensorrt_llm/thop/causalConv1dOp.cpp | 8 +- .../convertSpecDecodingMaskToPackedMaskOp.cpp | 9 +- cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp | 10 +- cpp/tensorrt_llm/thop/cublasScaledMM.cpp | 8 +- cpp/tensorrt_llm/thop/cublasScaledMM.h | 6 + cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp | 6 +- cpp/tensorrt_llm/thop/cudaScaledMM.cpp | 6 +- cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp | 8 +- cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp | 20 +- cpp/tensorrt_llm/thop/cutlassScaledMM.cpp | 6 +- cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp | 6 +- cpp/tensorrt_llm/thop/dsv3RopeOp.cpp | 6 +- cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp | 6 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 10 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.h | 5 + .../finegrained_mixed_dtype_gemm_thop.cpp | 10 +- .../thop/finegrained_mixed_dtype_gemm_thop.h | 5 + cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp | 13 +- cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp | 6 +- cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp | 19 +- cpp/tensorrt_llm/thop/fp4Gemm.cpp | 14 +- cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp | 6 +- cpp/tensorrt_llm/thop/fp4Op.cpp | 26 +- cpp/tensorrt_llm/thop/fp4Quantize.cpp | 8 +- cpp/tensorrt_llm/thop/fp4Quantize.h | 6 + .../thop/fp4xFp8GemmTrtllmGen.cpp | 6 +- .../thop/fp8BatchedGemmTrtllmGen.cpp | 10 +- cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp | 10 +- cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp | 13 +- cpp/tensorrt_llm/thop/fp8Op.cpp | 29 +- cpp/tensorrt_llm/thop/fp8Op.h | 5 + .../thop/fp8PerTensorScaleMoe.cpp | 6 +- 
.../thop/fp8PerTensorScalingTrtllmGenGemm.cpp | 6 +- cpp/tensorrt_llm/thop/fp8Quantize.cpp | 8 +- cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp | 10 +- cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp | 4 + cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp | 6 +- cpp/tensorrt_llm/thop/gatherTreeOp.cpp | 6 +- cpp/tensorrt_llm/thop/groupRmsNormOp.cpp | 10 +- cpp/tensorrt_llm/thop/helixPostProcessOp.cpp | 4 + cpp/tensorrt_llm/thop/llama4MinLatency.cpp | 12 +- cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp | 6 +- cpp/tensorrt_llm/thop/loraOp.cpp | 6 +- cpp/tensorrt_llm/thop/mambaConv1dOp.cpp | 6 +- cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp | 12 +- cpp/tensorrt_llm/thop/moeAlignOp.cpp | 7 +- cpp/tensorrt_llm/thop/moeAlltoAllMeta.h | 6 + cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp | 17 +- cpp/tensorrt_llm/thop/moeCommOp.cpp | 18 +- cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp | 18 +- cpp/tensorrt_llm/thop/moeOp.cpp | 16 +- cpp/tensorrt_llm/thop/moeUtilOp.cpp | 8 +- cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp | 16 +- cpp/tensorrt_llm/thop/mxFp8Quantize.cpp | 6 +- cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp | 13 +- cpp/tensorrt_llm/thop/ncclCommunicatorOp.h | 5 + cpp/tensorrt_llm/thop/noAuxTcOp.cpp | 6 +- .../thop/parallelDecodeKVCacheUpdateOp.cpp | 6 +- cpp/tensorrt_llm/thop/redrafterCurandOp.cpp | 8 +- cpp/tensorrt_llm/thop/reducescatterOp.cpp | 12 +- .../thop/relativeAttentionBiasOp.cpp | 8 +- cpp/tensorrt_llm/thop/selectiveScanOp.cpp | 6 +- cpp/tensorrt_llm/thop/specDecOp.cpp | 16 +- cpp/tensorrt_llm/thop/thUtils.cpp | 4 + cpp/tensorrt_llm/thop/thUtils.h | 5 + cpp/tensorrt_llm/thop/tinygemm2.cpp | 6 +- .../thop/userbuffersFinalizeOp.cpp | 2 +- cpp/tensorrt_llm/thop/userbuffersTensor.cpp | 6 +- cpp/tensorrt_llm/thop/userbuffersTensor.h | 5 + cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp | 11 +- cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h | 5 + cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp | 36 +- .../unit_tests/multi_gpu/mpiUtilsTest.cpp | 2 + .../unit_tests/multi_gpu/ncclUtilsTest.cpp | 2 +- 
cpp/tests/unit_tests/thop/thUtilsTest.cpp | 2 +- scripts/build_wheel.py | 5 + .../_torch/custom_ops/cpp_custom_ops.py | 201 +- ...test_trtllm_flashinfer_symbol_collision.py | 83 + 621 files changed, 4168 insertions(+), 9576 deletions(-) create mode 100644 cpp/include/tensorrt_llm/common/config.h create mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp mode change 100755 => 100644 cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp mode change 100755 => 100644 cpp/tensorrt_llm/thop/ncclCommunicatorOp.h create mode 100644 tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py diff --git a/.gitattributes b/.gitattributes index 6f2d66838c..7b111ed877 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text +cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text +cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 712957ddd5..130ea9837b 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,7 @@ llm-test-workspace/ cpp/include/tensorrt_llm/executor/version.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h +cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp .devcontainer/.env /examples/layer_wise_benchmarks/profiles/ diff --git 
a/benchmarks/cpp/utils/utils.cpp b/benchmarks/cpp/utils/utils.cpp index 3a7c885c32..0cbcf1c046 100644 --- a/benchmarks/cpp/utils/utils.cpp +++ b/benchmarks/cpp/utils/utils.cpp @@ -1,6 +1,7 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & + *AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,13 +18,16 @@ */ #include "utils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include #include -namespace tensorrt_llm::benchmark +TRTLLM_NAMESPACE_BEGIN + +namespace benchmark { std::vector> parseVectorOfVectors(std::string const& input) @@ -98,7 +102,8 @@ Samples parseWorkloadJson( if (samples.size() < maxNumSamples) { TLLM_LOG_WARNING( - "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n", + "Dataset size %zu is smaller than given max_num_samples " + "%d, max_num_samples will be ignored.\n", samples.size(), maxNumSamples); } return samples; @@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric) return os; } -} // namespace tensorrt_llm::benchmark +} // namespace benchmark + +TRTLLM_NAMESPACE_END diff --git a/benchmarks/cpp/utils/utils.h b/benchmarks/cpp/utils/utils.h index 13e9fe1206..375a1cd9bf 100644 --- a/benchmarks/cpp/utils/utils.h +++ b/benchmarks/cpp/utils/utils.h @@ -16,6 +16,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/executor/executor.h" #include @@ -29,7 +30,9 @@ #pragma once -namespace tensorrt_llm::benchmark +TRTLLM_NAMESPACE_BEGIN + +namespace benchmark { // using namespace tensorrt_llm::batch_manager; @@ -237,4 +240,6 @@ std::vector generateRandomExponentialValues(int count, float lambda, int std::vector computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays); -} // namespace tensorrt_llm::benchmark +} // namespace benchmark + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/algorithm.h b/cpp/include/tensorrt_llm/common/algorithm.h index 9363504f75..9fcf7b2b4a 100644 --- a/cpp/include/tensorrt_llm/common/algorithm.h +++ b/cpp/include/tensorrt_llm/common/algorithm.h @@ -16,8 +16,9 @@ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN // Base class for algorithms struct Algorithm @@ -29,4 +30,4 @@ struct Algorithm Algorithm& operator=(Algorithm const&) = delete; }; -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/arrayView.h b/cpp/include/tensorrt_llm/common/arrayView.h index 31dcd74532..ce4ceb9ed6 100644 --- a/cpp/include/tensorrt_llm/common/arrayView.h +++ b/cpp/include/tensorrt_llm/common/arrayView.h @@ -17,9 +17,13 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { //! 
@@ -100,4 +104,6 @@ private: size_type mSize; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/assert.h b/cpp/include/tensorrt_llm/common/assert.h index 0e916b7746..d53630ab5d 100644 --- a/cpp/include/tensorrt_llm/common/assert.h +++ b/cpp/include/tensorrt_llm/common/assert.h @@ -16,14 +16,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/tllmException.h" +TRTLLM_NAMESPACE_BEGIN + class DebugConfig { public: static bool isCheckDebugEnabled(); }; +TRTLLM_NAMESPACE_END + #if defined(_WIN32) #define TLLM_LIKELY(x) (__assume((x) == 1), (x)) #define TLLM_UNLIKELY(x) (__assume((x) == 0), (x)) @@ -35,8 +40,8 @@ public: #define TLLM_CHECK(val) \ do \ { \ - TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ - : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ } while (0) #define TLLM_CHECK_WITH_INFO(val, info, ...) \ @@ -51,17 +56,17 @@ public: #define TLLM_CHECK_DEBUG(val) \ do \ { \ - if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \ { \ - TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ - : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ } \ } while (0) #define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \ do \ { \ - if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \ { \ TLLM_LIKELY(static_cast(val)) \ ? 
((void) 0) \ diff --git a/cpp/include/tensorrt_llm/common/bindingUtils.h b/cpp/include/tensorrt_llm/common/bindingUtils.h index 83f72c676a..d61e1f7a14 100644 --- a/cpp/include/tensorrt_llm/common/bindingUtils.h +++ b/cpp/include/tensorrt_llm/common/bindingUtils.h @@ -17,9 +17,13 @@ #pragma once #include "c10/util/intrusive_ptr.h" +#include "tensorrt_llm/common/config.h" + #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Adapted from pybind11's example implementation: @@ -69,4 +73,6 @@ c10::intrusive_ptr get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a return c10::intrusive_ptr::reclaim_copy(p); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/config.h b/cpp/include/tensorrt_llm/common/config.h new file mode 100644 index 0000000000..71b97f9ab5 --- /dev/null +++ b/cpp/include/tensorrt_llm/common/config.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#ifndef TRTLLM_CONFIG_H +#define TRTLLM_CONFIG_H + +/** + * \def TRTLLM_ABI_NAMESPACE + * This macro is used to open an implicitly inline namespace block for the ABI version. + * This macro can be overridden to change the ABI version. + * The default ABI version is _v1. 
+ */ +#ifndef TRTLLM_ABI_NAMESPACE +#define TRTLLM_ABI_NAMESPACE _v1 +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_BEGIN +#define TRTLLM_ABI_NAMESPACE_BEGIN \ + inline namespace TRTLLM_ABI_NAMESPACE \ + { +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_END +#define TRTLLM_ABI_NAMESPACE_END } +#endif + +/** + * \def TRTLLM_NAMESPACE_BEGIN + * This macro is used to open a `tensorrt_llm::` namespace block, along with any + * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_BEGIN \ + namespace tensorrt_llm \ + { \ + TRTLLM_ABI_NAMESPACE_BEGIN + +/** + * \def TRTLLM_NAMESPACE_END + * This macro is used to close a `tensorrt_llm::` namespace block, along with any + * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_END \ + TRTLLM_ABI_NAMESPACE_END \ + } /* end namespace tensorrt_llm */ + +#endif // TRTLLM_CONFIG_H diff --git a/cpp/include/tensorrt_llm/common/cudaFp8Utils.h b/cpp/include/tensorrt_llm/common/cudaFp8Utils.h index 373aabc96c..75dae28eff 100644 --- a/cpp/include/tensorrt_llm/common/cudaFp8Utils.h +++ b/cpp/include/tensorrt_llm/common/cudaFp8Utils.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #ifdef ENABLE_FP8 #include #include @@ -29,8 +31,8 @@ #define USE_QGMMA #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream); } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END #endif // ENABLE_FP8 diff --git a/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h b/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h index 985f4619ee..4f369c0592 100644 --- a/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h +++ 
b/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h @@ -14,12 +14,18 @@ * limitations under the License. */ +#pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Populate the start and end profiling iteration indexes from the provided environment variables @@ -28,4 +34,6 @@ namespace tensorrt_llm::common std::pair, std::unordered_set> populateIterationIndexes( std::string const& envVarName, std::optional const& legacyEnvVarName = std::nullopt); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h index 6626b18e38..3a11df85b1 100644 --- a/cpp/include/tensorrt_llm/common/cudaUtils.h +++ b/cpp/include/tensorrt_llm/common/cudaUtils.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -49,7 +50,9 @@ // this undef. 
#endif // WIN32 -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // workspace for cublas gemm : 32MB @@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq) DEFINE_MEMBER_CHECKER(qua) DEFINE_MEMBER_CHECKER(high_preciecion_normed_output) -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END /* * Macros compliant with TensorRT coding conventions diff --git a/cpp/include/tensorrt_llm/common/dataType.h b/cpp/include/tensorrt_llm/common/dataType.h index 6c19322135..2f19404f9c 100644 --- a/cpp/include/tensorrt_llm/common/dataType.h +++ b/cpp/include/tensorrt_llm/common/dataType.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/tllmException.h" + #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { constexpr static size_t getDTypeSize(nvinfer1::DataType type) @@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type) return ""; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h index c8164b10e5..5477415edf 100644 --- a/cpp/include/tensorrt_llm/common/logger.h +++ b/cpp/include/tensorrt_llm/common/logger.h @@ -22,9 +22,12 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { class Logger @@ -125,12 +128,12 @@ private: static inline std::string getPrefix(Level const level) { - return fmtstr("%s[%s] ", kPREFIX, getLevelName(level)); + return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level)); } static inline std::string getPrefix(Level const level, int const rank) { - return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank); + return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, 
getLevelName(level), rank); } }; @@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format, out << std::endl; } } +} // namespace common + +TRTLLM_NAMESPACE_END #define TLLM_LOG(level, ...) \ do \ @@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format, #define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__) #define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__) #define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__) -} // namespace tensorrt_llm::common diff --git a/cpp/include/tensorrt_llm/common/optionalRef.h b/cpp/include/tensorrt_llm/common/optionalRef.h index af93ac6d36..f55b377981 100644 --- a/cpp/include/tensorrt_llm/common/optionalRef.h +++ b/cpp/include/tensorrt_llm/common/optionalRef.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /** @@ -100,4 +104,6 @@ public: } }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/quantization.h b/cpp/include/tensorrt_llm/common/quantization.h index 50aae114e0..df13a674d6 100644 --- a/cpp/include/tensorrt_llm/common/quantization.h +++ b/cpp/include/tensorrt_llm/common/quantization.h @@ -16,12 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -480,4 +482,5 @@ public: }; } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/stringUtils.h b/cpp/include/tensorrt_llm/common/stringUtils.h index a4803cba37..f4cf8a89be 100644 --- a/cpp/include/tensorrt_llm/common/stringUtils.h +++ b/cpp/include/tensorrt_llm/common/stringUtils.h @@ -16,6 +16,7 
@@ #pragma once +#include "tensorrt_llm/common/config.h" #if ENABLE_BF16 #include #endif // ENABLE_BF16 @@ -28,7 +29,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { #if ENABLE_BF16 static inline std::basic_ostream& operator<<(std::basic_ostream& stream, __nv_bfloat16 const& val) @@ -228,4 +231,6 @@ inline void toUpper(std::string& s) } } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/tllmException.h b/cpp/include/tensorrt_llm/common/tllmException.h index 9d222a0ca9..c705e1cf89 100644 --- a/cpp/include/tensorrt_llm/common/tllmException.h +++ b/cpp/include/tensorrt_llm/common/tllmException.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -41,7 +42,9 @@ tensorrt_llm::common::RequestSpecificException( \ __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode) -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Enumeration of different error codes for request-specific exceptions @@ -77,7 +80,8 @@ private: [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info) { - throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); + throw TllmException( + file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); } [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "") @@ -102,4 +106,6 @@ private: RequestErrorCode mErrorCode; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/utils.h b/cpp/include/tensorrt_llm/common/utils.h index 2a0ff72b53..22e6b628bb 100644 --- a/cpp/include/tensorrt_llm/common/utils.h +++ 
b/cpp/include/tensorrt_llm/common/utils.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include @@ -24,7 +26,9 @@ #include #endif -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { inline bool setThreadName(std::string const& name) @@ -43,4 +47,6 @@ bool contains(std::initializer_list const& c, T const& v) return std::find(c.begin(), c.end(), v) != c.end(); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/archCondition.h b/cpp/include/tensorrt_llm/kernels/archCondition.h index ef86d5745e..4d633d046b 100644 --- a/cpp/include/tensorrt_llm/kernels/archCondition.h +++ b/cpp/include/tensorrt_llm/kernels/archCondition.h @@ -16,7 +16,11 @@ #pragma once -namespace tensorrt_llm::kernels +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace detail @@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible::value; } // namespace arch -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/decodingCommon.h b/cpp/include/tensorrt_llm/kernels/decodingCommon.h index 116a85e2ee..aa7e2f961f 100644 --- a/cpp/include/tensorrt_llm/kernels/decodingCommon.h +++ b/cpp/include/tensorrt_llm/kernels/decodingCommon.h @@ -17,11 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/executor/types.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class FinishedState @@ -308,4 +311,6 @@ template void invokeScatterDecodingParams( T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h 
b/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h index e664db6400..6f9c2c78a1 100644 --- a/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h +++ b/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h @@ -17,11 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class KVCacheIndex @@ -53,4 +56,6 @@ private: UnderlyingType value; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h b/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h index 0119d8948a..6a6ac75ffa 100644 --- a/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h +++ b/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h @@ -14,16 +14,18 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/iBuffer.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads, unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 43a175ba80..75b4f2f56e 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname): params_str = 'reinterpret_cast(params)' if generate_cu_trtllm else 'params' attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;' bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;' 
- include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else '' + include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else '' + include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else '' num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;' fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}' const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}' @@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname): const int COMPUTE_REG_COUNT = {compute_reg_count}; asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format( compute_reg_count=compute_reg_count) - local_ns_open = ns_open if generate_cu_trtllm else '' - local_ns_close = ns_close if generate_cu_trtllm else '' + abi_ns_open = r""" +TRTLLM_NAMESPACE_BEGIN +namespace kernels +{ +// clang-format off +""" + abi_ns_close = r""" +// clang-format on +} // namespace kernels +TRTLLM_NAMESPACE_END +""" + local_ns_open = abi_ns_open if generate_cu_trtllm else '' + local_ns_close = abi_ns_close if generate_cu_trtllm else '' tmp = dict(locals(), **kspec._asdict()) @@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None): def get_cubin_header(kernel_traits, specs_names): cubins = [] cubin_lens = [] + launchers = [] cubins_dict = {} cubin_lens_dict = {} + launchers_dict = {} for kspec, fname, lname, kname in specs_names: if generate_cu_trtllm and not use_cubin_header( kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype): @@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names): if generate_cu_trtllm and lname != 'nullptr': launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, 
cudaStream_t stream);'.format( lname=lname) - if int(sm) in cubins_dict: - if launcher not in cubins_dict[int(sm)]: - cubins_dict[int(sm)].append(launcher) + if int(sm) in launchers_dict: + if launcher not in launchers_dict[int(sm)]: + launchers_dict[int(sm)].append(launcher) else: - cubins_dict[int(sm)] = [launcher] + launchers_dict[int(sm)] = [launcher] elif 'mhca' in kname: code = '''\ {{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\ @@ -3309,17 +3323,33 @@ def get_cubin_header(kernel_traits, specs_names): else: metadata_v2 = ',\n'.join(metadata_v2) # Add macros to only include needed cubins during compilation. - for sm in cubins_dict.keys(): + # Collect all SM versions from all dictionaries + all_sms = sorted( + set( + list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) + + list(launchers_dict.keys()))) + + for sm in all_sms: macro_begin = f"#ifndef EXCLUDE_SM_{sm}" macro_end = f"#endif\n" - cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end]) + + # Add cubin array declarations + if sm in cubins_dict: + cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end]) + + # Add cubin length declarations if sm in cubin_lens_dict: cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end]) + # Add launcher declarations + if sm in launchers_dict: + launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end]) + unroll_config_v1 = ',\n'.join(unroll_config_v1) unroll_config_v2 = ',\n'.join(unroll_config_v2) cubins = '\n'.join(cubins) cubin_lens = '\n'.join(cubin_lens) + launchers = '\n'.join(launchers) local_ns_open = ns_open local_ns_close = ns_close if generate_cu_trtllm else '}' launcher_line = ''' @@ -3431,7 +3461,157 @@ static const struct TestMetaV2 '''.format(**locals(), copyright=copyright) - return code + # Generate header content (.h file) + if "GENERATE_CUBIN" in os.environ: + header_content = '''\ +{copyright} 
+#pragma once + +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN +namespace kernels{{ + +struct FusedMultiHeadAttentionKernelMetaInfoV2 +{{ + Data_type mDataTypeIn; + Data_type mDataTypeOut; + unsigned int mS; + unsigned int mStepQ; + unsigned int mStepKV; + unsigned int mD; + unsigned int mDV; + unsigned int mSageBlockSizeQ; + unsigned int mSageBlockSizeK; + unsigned int mSageBlockSizeV; + unsigned int mSM; + const unsigned char* mCubin; + unsigned int mCubinSize; + const char* mFuncName; + unsigned int mSharedMemBytes; + unsigned int mThreadsPerCTA; + unsigned int mUnrollStep; + int mAttentionMaskType; + int mAttentionInputLayout; + bool mInterleaved; + bool mFlashAttention; + bool mWarpSpecialization; + bool mFP32Accumulation; + bool mAlibiSupported; + bool mTiled; + bool mEnableAttnLogitSoftcapping; + bool mReturnSoftmaxStats;{launcher_line} +}}; + +extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[]; +extern const int sMhaKernelMetaInfosV2Size; + +}} // namespace kernels +TRTLLM_NAMESPACE_END +'''.format(**locals(), copyright=copyright) + # Generate source content (.cpp file) + source_content = '''\ +{copyright} + +#include "tensorrt_llm/common/config.h" + +#include +#include +#include + +{local_ns_open} + +//--- Cubin Arrays +{cubins} + +//--- Cubin Lengths +{cubin_lens} + +{local_ns_close} + +using namespace tensorrt_llm::kernels; + +namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{ + +class Fused_multihead_attention_params_v2; +class Launch_params; + +//--- Kernel Launchers +{launchers} + +// FIXME: These are duplicated declarations, we should remove them in the future. 
+constexpr int32_t kSM_70 = 70; +constexpr int32_t kSM_72 = 72; +constexpr int32_t kSM_75 = 75; +constexpr int32_t kSM_80 = 80; +constexpr int32_t kSM_86 = 86; +constexpr int32_t kSM_89 = 89; +constexpr int32_t kSM_90 = 90; +constexpr int32_t kSM_100 = 100; +constexpr int32_t kSM_100f = 10100; +constexpr int32_t kSM_103 = 103; +constexpr int32_t kSM_120 = 120; +constexpr int32_t kSM_121 = 121; + +// FIXME: These are duplicated declarations, we should remove them in the future. +enum Data_type +{{ + DATA_TYPE_BOOL, + DATA_TYPE_FP16, + DATA_TYPE_FP32, + DATA_TYPE_INT4, + DATA_TYPE_INT8, + DATA_TYPE_INT32, + DATA_TYPE_BF16, + DATA_TYPE_E2M1, + DATA_TYPE_E4M3, + DATA_TYPE_E5M2 +}}; + +struct FusedMultiHeadAttentionKernelMetaInfoV2 +{{ + Data_type mDataTypeIn; + Data_type mDataTypeOut; + unsigned int mS; + unsigned int mStepQ; + unsigned int mStepKV; + unsigned int mD; + unsigned int mDV; + unsigned int mSageBlockSizeQ; + unsigned int mSageBlockSizeK; + unsigned int mSageBlockSizeV; + unsigned int mSM; + const unsigned char* mCubin; + unsigned int mCubinSize; + const char* mFuncName; + unsigned int mSharedMemBytes; + unsigned int mThreadsPerCTA; + unsigned int mUnrollStep; + int mAttentionMaskType; + int mAttentionInputLayout; + bool mInterleaved; + bool mFlashAttention; + bool mWarpSpecialization; + bool mFP32Accumulation; + bool mAlibiSupported; + bool mTiled; + bool mEnableAttnLogitSoftcapping; + bool mReturnSoftmaxStats;{launcher_line} +}}; + +extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{ +{metadata_v2} +}}; + +extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]); +}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels +'''.format(**locals(), copyright=copyright) + else: + # Non-GENERATE_CUBIN mode: use old behavior + header_content = code + source_content = None + + return header_content, source_content # This is used to add some kernels running in cubins for passing 
CI cases. @@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header): return result target = "#ifndef EXCLUDE_SM_80" - addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[]; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;""" - result = add_kernel_line(result, target, addition) + addition_cubin_array = """ +#ifndef EXCLUDE_SM_80 +extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[]; +#endif +""" + addition_cubin_length = """ +#ifndef EXCLUDE_SM_80 +extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len; +#endif +""" + # Add cubin array and length into there corresponding sections. + result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array) + result = add_kernel_line(result, "//--- Cubin Lengths", + addition_cubin_length) def modify_kernel_line(result, target, new_line): lines = result.split('\n') @@ -3534,13 +3725,22 @@ def generate_files(specs_names): output = output.decode('utf-8').strip() # this gives: kname, smem bytes, threads_per_cta, loop_step kernel_traits = [traits.split() for traits in output.splitlines()] - cubin_header = get_cubin_header(kernel_traits, valid_specs_names) + # Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files + # To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header + cubin_header, cubin_source = get_cubin_header(kernel_traits, + valid_specs_names) if generate_cu_trtllm: - cubin_header = modify_cubin_header(cubin_header) + cubin_source = modify_cubin_header(cubin_source) + # Write fmha_cubin.h file with open('./generated/fmha_cubin.h', 'w') as f: f.write(cubin_header) + # Write fmha_cubin.cpp file (same directory as fmha_cubin.h file) + if cubin_source is not None: + with open('./generated/fmha_cubin.cpp', 'w') as f: + f.write(cubin_source) + def enumerate_hgmma_tma_kernels(specs, 
sm=90): specs.append( diff --git a/cpp/kernels/xqa/gen_cpp_header.py b/cpp/kernels/xqa/gen_cpp_header.py index 51417bc96a..9513b5d456 100755 --- a/cpp/kernels/xqa/gen_cpp_header.py +++ b/cpp/kernels/xqa/gen_cpp_header.py @@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/* */ #pragma once -namespace tensorrt_llm { +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN namespace kernels { ''' @@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}"; ''' TEMPLATE_EPILOGUE = '''} -} +TRTLLM_NAMESPACE_END + ''' D = defaultdict(list) diff --git a/cpp/kernels/xqa/gen_cubins.py b/cpp/kernels/xqa/gen_cubins.py index 2a284f834a..a345861fb7 100755 --- a/cpp/kernels/xqa/gen_cubins.py +++ b/cpp/kernels/xqa/gen_cubins.py @@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/* * See the License for the specific language governing permissions and * limitations under the License. */ -namespace tensorrt_llm -{ + +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN namespace kernels { // clang-format off @@ -96,7 +98,7 @@ namespace kernels cpp_file_suffex_text = R""" // clang-format on } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END """ cubin_meta_info_struct_prefix_text = R""" diff --git a/cpp/tensorrt_llm/common/assert.cpp b/cpp/tensorrt_llm/common/assert.cpp index eaaf662447..4211a9a049 100755 --- a/cpp/tensorrt_llm/common/assert.cpp +++ b/cpp/tensorrt_llm/common/assert.cpp @@ -27,7 +27,7 @@ bool initCheckDebug() } } // namespace -bool DebugConfig::isCheckDebugEnabled() +bool tensorrt_llm::DebugConfig::isCheckDebugEnabled() { static bool const debugEnabled = initCheckDebug(); return debugEnabled; diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index f4ae207321..5994021eb4 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -16,6 +16,7 @@ */ #include "attentionOp.h" #include "tensorrt_llm/common/assert.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index f6c78480b6..653b4d65e7 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/common/quantization.h" @@ -36,7 +37,9 @@ #include #endif // ENABLE_MULTI_DEVICE -namespace tensorrt_llm::common::op +TRTLLM_NAMESPACE_BEGIN + +namespace common::op { class AttentionOp @@ -543,4 +546,6 @@ private: UniqPtrWNullCopy mMultiBlockSemaphores = {}; }; -} // namespace tensorrt_llm::common::op +} // namespace common::op + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp index f3e81defd3..5cbe1b30d3 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp @@ -16,6 +16,7 @@ #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasVersionCheck.h" #include #include @@ -24,8 +25,8 @@ #error CUDART_VERSION Undefined! 
#endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t } // namespace common -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.h b/cpp/tensorrt_llm/common/cublasMMWrapper.h index 1ca1dbfee6..78a68204ea 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.h +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -185,4 +186,4 @@ public: } // namespace common -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh index 0519251e6f..583c4991ea 100644 --- a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh +++ b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _ #endif // ENABLE_BF16 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END // Operator definitions intentionally in global namespace namespace diff --git a/cpp/tensorrt_llm/common/cudaBufferUtils.cuh b/cpp/tensorrt_llm/common/cudaBufferUtils.cuh index a5da5bbcae..aad5e83cbf 100644 --- a/cpp/tensorrt_llm/common/cudaBufferUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaBufferUtils.cuh @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include @@ -28,8 +29,8 @@ #include #include -namespace tensorrt_llm -{ 
+TRTLLM_NAMESPACE_BEGIN + namespace common { static __host__ __device__ int hash(int val) @@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer }; } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp index c754f39277..b961ef5042 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -18,6 +18,7 @@ #if defined(_WIN32) #include + #define dllOpen(name) LoadLibrary("nv" name ".dll") #define dllClose(handle) FreeLibrary(static_cast(handle)) #define dllGetSym(handle, name) static_cast(GetProcAddress(static_cast(handle), name)) @@ -29,6 +30,7 @@ #endif // defined(_WIN32) #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/logger.h" #include @@ -36,7 +38,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::shared_ptr CUDADriverWrapper::getInstance() @@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters( return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h index cc3328993c..236be28fd2 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h @@ -17,6 +17,7 @@ #ifndef CUDA_DRIVER_WRAPPER_H #define CUDA_DRIVER_WRAPPER_H +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { class CUDADriverWrapper @@ -165,8 +168,9 @@ void checkDriverExitSafe(T 
result, char const* const func, char const* const fil } } -} // namespace tensorrt_llm::common +} // namespace common +TRTLLM_NAMESPACE_END /* * Macros compliant with TensorRT coding conventions */ diff --git a/cpp/tensorrt_llm/common/cudaFp8Utils.cu b/cpp/tensorrt_llm/common/cudaFp8Utils.cu index 06afb96b95..39616f100c 100644 --- a/cpp/tensorrt_llm/common/cudaFp8Utils.cu +++ b/cpp/tensorrt_llm/common/cudaFp8Utils.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { #ifdef ENABLE_FP8 @@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3); #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp b/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp index 5576fe782f..959fa3e906 100644 --- a/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp +++ b/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/cudaProfilerUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -54,7 +55,9 @@ std::tuple, std::unordered_set> populateIte } // namespace -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::pair, std::unordered_set> populateIterationIndexes( @@ -81,4 +84,6 @@ std::pair, std::unordered_set> populateIter return std::make_pair(profileIterIdxs, stopIterIdxs); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh index a0463a3a49..157b561d4c 100644 --- 
a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh @@ -25,9 +25,10 @@ #if ENABLE_BF16 #include #endif +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace common { @@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val) #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 9a466512e4..4115ac150f 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h" @@ -25,7 +26,9 @@ using tensorrt_llm::kernels::AllReduceFusionOp; using tensorrt_llm::kernels::AllReduceStrategyType; -namespace tensorrt_llm::utils::customAllReduceUtils +TRTLLM_NAMESPACE_BEGIN + +namespace utils::customAllReduceUtils { constexpr size_t NUM_POINTERS_PER_RANK = 7; @@ -292,4 +295,6 @@ inline const std::unordered_map AllReduceBe {90, AllReduceBestStrategyTableSM90}, {100, AllReduceBestStrategyTableSM100}, }; -} // namespace tensorrt_llm::utils::customAllReduceUtils +} // namespace utils::customAllReduceUtils + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index 3dfeb91a9e..fc85975acb 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -16,6 +16,7 @@ */ #include "envUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/stringUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace 
common { std::optional getIntEnv(char const* name) @@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy() return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY"); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 6142781f6a..8a3af2458d 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -16,13 +16,16 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Useful when you want to inject some debug code controllable with env var. std::optional getIntEnv(char const* name); @@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow(); bool getEnvEplbForceGdrcopy(); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/lamportUtils.cuh b/cpp/tensorrt_llm/common/lamportUtils.cuh index 4713d1a240..9e2f22d1a1 100644 --- a/cpp/tensorrt_llm/common/lamportUtils.cuh +++ b/cpp/tensorrt_llm/common/lamportUtils.cuh @@ -19,6 +19,7 @@ #ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH #define TRTLLM_CUDA_LAMPORT_UTILS_CUH +#include "tensorrt_llm/common/config.h" #include #include #include @@ -29,7 +30,9 @@ #include "tensorrt_llm/common/cudaTypeUtils.cuh" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { constexpr uint16_t kNEGZERO_FP16 = 0x8000U; @@ -279,6 +282,7 @@ private: } }; -} // namespace tensorrt_llm::common +} // namespace common +TRTLLM_NAMESPACE_END #endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH diff --git a/cpp/tensorrt_llm/common/logger.cpp b/cpp/tensorrt_llm/common/logger.cpp index 2c2edb5af8..5daa79d92e 100644 --- a/cpp/tensorrt_llm/common/logger.cpp +++ b/cpp/tensorrt_llm/common/logger.cpp @@ -15,12 +15,15 @@ */ #include "tensorrt_llm/common/logger.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { Logger::Logger() @@ -70,4 +73,6 @@ Logger* Logger::getLogger() thread_local Logger instance; return &instance; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mathUtils.h b/cpp/tensorrt_llm/common/mathUtils.h index 1bad3a2c15..670923dc28 100644 --- a/cpp/tensorrt_llm/common/mathUtils.h +++ b/cpp/tensorrt_llm/common/mathUtils.h @@ -16,10 +16,11 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n) //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp b/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp index b490e2bcdb..8dcd6b1985 100644 --- a/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp +++ b/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp @@ -14,11 +14,15 @@ * limitations under the License. 
*/ #include "mcastDevMemUtils.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common +using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory; + +TRTLLM_NAMESPACE_BEGIN + +namespace common { -using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory; namespace { @@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr) { return McastDevMemBufferRegistry::getInstance().findBuffer(ptr); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mcastDevMemUtils.h b/cpp/tensorrt_llm/common/mcastDevMemUtils.h index def72dd044..50c7a48291 100644 --- a/cpp/tensorrt_llm/common/mcastDevMemUtils.h +++ b/cpp/tensorrt_llm/common/mcastDevMemUtils.h @@ -15,13 +15,17 @@ */ #pragma once -// Avoid circular dependency +#include "tensorrt_llm/common/config.h" + namespace tensorrt_llm::runtime { class McastDeviceMemory; -} +} // namespace tensorrt_llm::runtime -namespace tensorrt_llm::common +// Avoid circular dependency +TRTLLM_NAMESPACE_BEGIN + +namespace common { using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory; // Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer! @@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf); // information. Thus a derived pointer cannot used as the key. 
McastDeviceMemory* findMcastDevMemBuffer(void* ptr); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/memoryUtils.cu b/cpp/tensorrt_llm/common/memoryUtils.cu index ff22bbb7c4..fc13db3096 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.cu +++ b/cpp/tensorrt_llm/common/memoryUtils.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" @@ -25,8 +26,8 @@ #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -961,4 +962,5 @@ void calcAlignedPointers( } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/memoryUtils.h b/cpp/tensorrt_llm/common/memoryUtils.h index 267c6015b2..f55e422631 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.h +++ b/cpp/tensorrt_llm/common/memoryUtils.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers( } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h index d128741e0a..8e5d2c9154 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.h +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -46,7 +47,9 @@ #include #endif -namespace tensorrt_llm::common::nccl_util +TRTLLM_NAMESPACE_BEGIN + +namespace common::nccl_util { 
//============================================================================== @@ -392,6 +395,8 @@ inline std::pair createNCCLWindowTensor( return std::make_pair(tensor, buffer); } -} // namespace tensorrt_llm::common::nccl_util +} // namespace common::nccl_util + +TRTLLM_NAMESPACE_END #endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/nvtxUtils.h b/cpp/tensorrt_llm/common/nvtxUtils.h index 4891a612ba..07f063e913 100644 --- a/cpp/tensorrt_llm/common/nvtxUtils.h +++ b/cpp/tensorrt_llm/common/nvtxUtils.h @@ -25,10 +25,13 @@ #if defined(__clang__) #pragma clang diagnostic pop #endif +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common::nvtx +TRTLLM_NAMESPACE_BEGIN + +namespace common::nvtx { inline nvtx3::color nextColor() { @@ -46,8 +49,9 @@ inline nvtx3::color nextColor() #endif } -} // namespace tensorrt_llm::common::nvtx +} // namespace common::nvtx +TRTLLM_NAMESPACE_END #define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \ ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name) #define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 72d966e43d..3acdf54843 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -29,6 +29,7 @@ #include #include +TRTLLM_NAMESPACE_BEGIN #if ENABLE_MULTI_DEVICE std::unordered_map* getDtypeMap() @@ -378,3 +379,5 @@ std::shared_ptr getCublasLtHandle() }); return creator(); } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/opUtils.h b/cpp/tensorrt_llm/common/opUtils.h index cb5911fe10..3018a5da10 100644 --- a/cpp/tensorrt_llm/common/opUtils.h +++ b/cpp/tensorrt_llm/common/opUtils.h @@ -17,6 +17,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/workspace.h" @@ -37,7 +38,9 @@ #include #include -namespace 
tensorrt_llm::common::op +TRTLLM_NAMESPACE_BEGIN + +namespace common::op { // Write values into buffer @@ -178,7 +181,7 @@ struct hash // for testing only void const* getCommSessionHandle(); -} // namespace tensorrt_llm::common::op +} // namespace common::op inline bool isBuilding() { @@ -220,6 +223,8 @@ std::shared_ptr getComm(std::set const& group); std::shared_ptr getCublasHandle(); std::shared_ptr getCublasLtHandle(); +TRTLLM_NAMESPACE_END + #ifndef DEBUG #define PLUGIN_CHECK(status) \ diff --git a/cpp/tensorrt_llm/common/quantTypeUtils.cuh b/cpp/tensorrt_llm/common/quantTypeUtils.cuh index a228d3f9fc..bfe924b109 100644 --- a/cpp/tensorrt_llm/common/quantTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/quantTypeUtils.cuh @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3> #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 04af7e4ec5..485a4aedb4 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -21,6 +21,7 @@ #else #include #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include #include @@ -30,8 +31,8 @@ namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input) } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/safetensors.cpp b/cpp/tensorrt_llm/common/safetensors.cpp index d948e91146..9171f79e44 100644 --- 
a/cpp/tensorrt_llm/common/safetensors.cpp +++ b/cpp/tensorrt_llm/common/safetensors.cpp @@ -17,6 +17,7 @@ #include "safetensors.h" #include "nlohmann/json.hpp" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common::safetensors +TRTLLM_NAMESPACE_BEGIN + +namespace common::safetensors { using nvinfer1::DataType; @@ -164,4 +167,6 @@ std::shared_ptr ISafeTensor::open(char const* filename) { return std::make_shared(filename); } -} // namespace tensorrt_llm::common::safetensors +} // namespace common::safetensors + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/safetensors.h b/cpp/tensorrt_llm/common/safetensors.h index 3af8d959be..e31225f1be 100644 --- a/cpp/tensorrt_llm/common/safetensors.h +++ b/cpp/tensorrt_llm/common/safetensors.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::common::safetensors +TRTLLM_NAMESPACE_BEGIN + +namespace common::safetensors { class INdArray { @@ -58,4 +61,6 @@ public: virtual ~ISafeTensor() = default; }; -} // namespace tensorrt_llm::common::safetensors +} // namespace common::safetensors + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/stlUtils.h b/cpp/tensorrt_llm/common/stlUtils.h index 9cda9fa0d4..7b12fd6d34 100644 --- a/cpp/tensorrt_llm/common/stlUtils.h +++ b/cpp/tensorrt_llm/common/stlUtils.h @@ -16,12 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::common::stl_utils +TRTLLM_NAMESPACE_BEGIN + +namespace common::stl_utils { template @@ -120,4 +123,6 @@ std::string toString(std::optional const& t, typename std::enable_if_t #include @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + 
+namespace common { void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args) @@ -73,4 +76,6 @@ std::unordered_set str2set(std::string const& input, char delimiter return values; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/timestampUtils.cpp b/cpp/tensorrt_llm/common/timestampUtils.cpp index c00041abda..66c01fbd7a 100644 --- a/cpp/tensorrt_llm/common/timestampUtils.cpp +++ b/cpp/tensorrt_llm/common/timestampUtils.cpp @@ -14,13 +14,16 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include #include #include "tensorrt_llm/common/timestampUtils.h" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::string getCurrentTimestamp() @@ -39,4 +42,6 @@ std::string getCurrentTimestamp() return stream.str(); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/timestampUtils.h b/cpp/tensorrt_llm/common/timestampUtils.h index f52f23028c..92a9c0e38f 100644 --- a/cpp/tensorrt_llm/common/timestampUtils.h +++ b/cpp/tensorrt_llm/common/timestampUtils.h @@ -14,12 +14,17 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu" std::string getCurrentTimestamp(); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/tllmException.cpp b/cpp/tensorrt_llm/common/tllmException.cpp index a6aaa5e259..1b71fe5572 100644 --- a/cpp/tensorrt_llm/common/tllmException.cpp +++ b/cpp/tensorrt_llm/common/tllmException.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/tllmException.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -26,7 +27,9 @@ #endif #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { namespace @@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept return mErrorCode; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/workspace.h b/cpp/tensorrt_llm/common/workspace.h index 0dd32ed16d..c92d02fa9d 100644 --- a/cpp/tensorrt_llm/common/workspace.h +++ b/cpp/tensorrt_llm/common/workspace.h @@ -14,10 +14,13 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // CuBLAS >= 12.9.1 requires 256-byte alignment. 
@@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize( return total; } -}; // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h index c83a9a074d..c49cd09cdb 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h @@ -18,10 +18,11 @@ #include #include "cutlass/device_kernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace cutlass_extensions { @@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel() } } // namespace cutlass_extensions -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h index 032f411f17..c6326ef0fe 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h @@ -30,10 +30,11 @@ #include "cutlass/epilogue/thread/linear_combination_relu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h" #include "cutlass_extensions/epilogue/thread/fused_activations.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace cutlass_extensions { @@ -150,4 +151,5 @@ struct Epilogue const& modelPathOpt, std::optional const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig, - ExecutorConfig const& executorConfig, bool isEncoder, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder, std::optional> const& managedWeightsOpt) { auto const 
gpusPerNode = jsonConfig.getGpusPerNode(); @@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional const& model Executor::Impl::Impl(std::filesystem::path const& modelPath, std::optional const& encoderModelPath, ModelType const modelType, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json"); @@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath, Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr, std::optional const& encoderEngineBufferView, std::optional const& encoderJsonConfigStr, - ModelType const modelType, ExecutorConfig const& executorConfig, + ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional> const& managedWeightsOpt) { auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr); @@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json } Executor::Impl::Impl(std::shared_ptr model, std::optional> encoderModel, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto const& worldConfig = model->getWorldConfig(); auto const tp = worldConfig.getTensorParallelism(); @@ -388,7 +389,7 @@ Executor::Impl::~Impl() shutdown(); } -void Executor::Impl::initialize(ExecutorConfig const& executorConfig) +void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig) std::shared_ptr Executor::Impl::createModel(runtime::RawEngine const& rawEngine, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { 
auto const gptModelType = [&executorConfig, &modelConfig]() { @@ -512,7 +513,7 @@ std::shared_ptr Executor::Impl::createModel(runtime::RawEngine const& raw std::shared_ptr Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto fixedExecutorConfig = ExecutorConfig{}; fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig()); @@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm( } void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp, - ExecutorConfig const& executorConfig, std::optional modelType, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional modelType, std::optional const& modelPath, std::optional const& worldConfig, std::optional const& decoderGptJsonConfig) { @@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig } void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp, - ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, std::filesystem::path const& modelPath) { #if ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h b/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h index b0ac689d38..9e316c0b4e 100644 --- a/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h +++ b/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h @@ -16,9 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* 
k_cache, @@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca cudaStream_t stream = 0); } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/IndexerTopK.h b/cpp/tensorrt_llm/kernels/IndexerTopK.h index 546d18d7a4..e4c79a3f1b 100644 --- a/cpp/tensorrt_llm/kernels/IndexerTopK.h +++ b/cpp/tensorrt_llm/kernels/IndexerTopK.h @@ -17,12 +17,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux, int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, @@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048, cudaStream_t const stream = 0); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/attentionMask.cu b/cpp/tensorrt_llm/kernels/attentionMask.cu index 64514a926a..a31b3e1ae7 100644 --- a/cpp/tensorrt_llm/kernels/attentionMask.cu +++ b/cpp/tensorrt_llm/kernels/attentionMask.cu @@ -15,6 +15,7 @@ */ #include "attentionMask.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const& //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace 
tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/attentionMask.h b/cpp/tensorrt_llm/kernels/attentionMask.h index fcfafb3df7..f3a4bf62c7 100644 --- a/cpp/tensorrt_llm/kernels/attentionMask.h +++ b/cpp/tensorrt_llm/kernels/attentionMask.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -25,8 +26,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -64,4 +65,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banBadWords.cu b/cpp/tensorrt_llm/kernels/banBadWords.cu index 53b55e8adc..c5f7799726 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.cu +++ b/cpp/tensorrt_llm/kernels/banBadWords.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/banBadWords.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banBadWords.h b/cpp/tensorrt_llm/kernels/banBadWords.h index 1057c45911..39fa10fdba 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.h +++ b/cpp/tensorrt_llm/kernels/banBadWords.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu index 9011811b45..e2d06f857d 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/banRepeatNgram.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16) } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.h b/cpp/tensorrt_llm/kernels/banRepeatNgram.h index 8218331734..5541dc4bca 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.h +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu index ff5f5347b4..005a153916 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -355,4 +356,5 @@ template void printLogProbs(float const* x, int const nBS, int const nBMI template void printLogProbs(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.h b/cpp/tensorrt_llm/kernels/beamSearchKernels.h index ebf41d7787..d8a9266e94 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK @@ -22,8 +23,8 @@ #define BEAM_SEARCH_DEBUG 0 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now @@ -88,7 +89,7 @@ struct BeamHypotheses // Pointers related to beam search process, they are initialized in those two functions: // [gptDecoder.cpp] GptDecoder::forward or [dynamicDecodeOp.cpp] FtDynamicDecode::forward bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished - FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished + ::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished // Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer::prepareIdsPtrs int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids @@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* 
tgtCI, int const* srcCI, BeamHypotheses& runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); __global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, + ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); __global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, + ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, @@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu index 2d611b877f..4d60055585 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu 
b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu index c76929186c..bf23a844b9 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu index 698459cfa1..50bf27b142 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu @@ -15,13 +15,15 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { // Skip V1 kernels if beam_width > kMaxBeamWidthForV1 INSTANTIATE_BEAM_SEARCH(float, 16, true); INSTANTIATE_BEAM_SEARCH(half, 16, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu index 1ba2498129..fae7cd927e 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu index 9e7f528725..d414d268c0 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu index ce74250dbc..d1815d85e3 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { INSTANTIATE_BEAM_SEARCH(float, 4, false); @@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true); INSTANTIATE_BEAM_SEARCH(half, 4, false); INSTANTIATE_BEAM_SEARCH(half, 4, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu index dd5f78a35f..005f44e5e7 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true); #endif // FAST_BUILD } // 
namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu index 65a43f9b4d..87a34b2d07 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu index e1161ddc6d..7b84b37050 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { INSTANTIATE_BEAM_SEARCH(float, 8, false); @@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true); INSTANTIATE_BEAM_SEARCH(half, 8, false); INSTANTIATE_BEAM_SEARCH(half, 8, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h index 331590c526..6ae82e5ad8 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h @@ -18,11 +18,13 @@ #error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/stringUtils.h" @@ -31,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -731,4 +733,5 @@ void beamSearchKernelLauncher( T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu index 951492b5ff..398ea05260 100644 --- a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu +++ b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu @@ -14,12 +14,12 @@ * limitations under the License. 
*/ +#include "buildRelativeAttentionBiasKernel.h" +#include "tensorrt_llm/common/config.h" #include -#include "buildRelativeAttentionBiasKernel.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h index 67f622345d..bdeea2b2af 100644 --- a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h +++ b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h @@ -17,10 +17,11 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu index 39b8136d25..8ec6bbbf82 100644 --- a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu +++ b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu @@ -19,12 +19,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h" -namespace tensorrt_llm::kernels::causal_conv1d +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::causal_conv1d { template @@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda(ConvParamsBase& params, cu template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::causal_conv1d +} // namespace kernels::causal_conv1d + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h index 53c9b042c4..2597ebbb30 100644 --- a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h +++ b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h @@ -20,11 +20,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -namespace tensorrt_llm::kernels::causal_conv1d +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::causal_conv1d { #define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError()) @@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::causal_conv1d +} // namespace kernels::causal_conv1d + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu index 785285bddd..2f6ac3fab7 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu @@ -13,13 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" #include -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { template struct SyncComm @@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params) DISPATCH_RANKS(16); TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!"); } -}; // namespace tensorrt_llm::kernels::ar_fusion +}; // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h index 52487b25d4..1fc18c415d 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h @@ -15,16 +15,19 @@ */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { template struct ElemsPerAccess; @@ -139,4 +142,6 @@ struct AllReduceFusionParams }; void allreduce_fusion_op(AllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::ar_fusion +} // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu index fc96dcc73f..3c4b4b5049 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { __global__ void lamport_initialize_kernel(float* ptr, int size) @@ -94,4 +97,6 @@ void** Workspace::get_workspace() { return reinterpret_cast(m_workspace); } -}; // namespace tensorrt_llm::kernels::ar_fusion +}; // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h index f72f94d296..055d29c3a0 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h @@ -16,11 +16,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { class Workspace @@ -41,4 +44,6 @@ private: }; void lamport_initialize(void* ptr, int bytes, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ar_fusion +} // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu index 82c17119e2..f1d5c08bda 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using tensorrt_llm::common::divUp; @@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce( sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h index f4df59fcf2..5fc87ef1a5 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h @@ -17,6 +17,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h" #include @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8; @@ -119,4 +122,6 @@ void customLowPrecisionAllReduce( kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); int32_t max_workspace_size_lowprecision(int32_t tp_size); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu index 5a0727fcc3..47d4cf3736 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "mnnvlAllreduceKernels.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -31,7 +32,9 @@ #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" -namespace tensorrt_llm::kernels::mnnvl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::mnnvl { using tensorrt_llm::common::isNegZero; @@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params) } } -} // namespace tensorrt_llm::kernels::mnnvl +} // namespace kernels::mnnvl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h index 422b32a702..5361f50221 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h @@ -16,11 +16,13 @@ #ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels::mnnvl +TRTLLM_NAMESPACE_BEGIN +namespace kernels::mnnvl { /** @@ -66,6 +68,7 @@ struct AllReduceFusionParams void oneshotAllreduceFusionOp(AllReduceFusionParams const& params); void twoshotAllreduceFusionOp(AllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::mnnvl +} // namespace kernels::mnnvl +TRTLLM_NAMESPACE_END #endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu index 7bc9e326fb..44a32f9a1f 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu @@ -13,13 +13,16 @@ * See the License for the 
specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" #include -namespace tensorrt_llm::kernels::ar_fusion::moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion::moe { template struct LamportComm @@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par #undef MOE_FINALIZE_DISPATCH1 } -}; // namespace tensorrt_llm::kernels::ar_fusion::moe +}; // namespace kernels::ar_fusion::moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h index 4a35d14bf0..556dd4e5cd 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h @@ -15,16 +15,19 @@ */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion::moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion::moe { static constexpr int kElemsPerAccess = 8; static constexpr int kOneShotMaxToken = 128; @@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::ar_fusion::moe +} // namespace kernels::ar_fusion::moe + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index 62c25ce3ca..1ee535bdbd 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/vec_dtypes.cuh" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::moe_comm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::moe_comm { #define ENABLE_DEBUG_PRINT 0 @@ -1082,4 +1085,6 @@ void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv expert_ids, recv_counters, ep_size, max_tokens_per_rank, top_k, invalid_id); } -} // namespace tensorrt_llm::kernels::moe_comm +} // namespace kernels::moe_comm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h index 93e6508253..193a3806df 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm::kernels::moe_comm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::moe_comm { // Configuration constants @@ -176,4 +179,6 @@ void moe_a2a_prepare_combine_launch(MoeA2ACombineParams const& params); void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv_counters, int32_t invalid_id, int ep_size, int max_tokens_per_rank, int top_k, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::moe_comm +} // namespace kernels::moe_comm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu index 03cf00df6d..a80edde888 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu @@ -15,6 +15,7 @@ */ #include "fmhaPackedMask.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -286,4 +287,5 @@ template void invokeBuildPackedMask(PackedMaskParams<__nv_bfloat16> const&, cuda //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h index 4f4c286fee..205aee942f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -25,8 +26,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -78,4 +79,5 @@ template void invokeBuildPackedMask(PackedMaskParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index e92838637a..13749d03e9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -15,6 +15,7 @@ */ #include "fmhaRunner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/mathUtils.h" #include @@ -28,8 +29,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -738,4 +739,5 @@ bool FusedMHARunnerV2::isFmhaSupported() } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h index afa8eb949a..ab2c82a544 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h @@ -29,11 +29,12 @@ #include "fused_multihead_attention_common.h" #include "fused_multihead_attention_v2.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tmaDescriptor.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -102,4 +103,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index c2c0c48d16..93002edeff 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -16,16 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" +#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" +#include "tensorrt_llm/kernels/sparseAttentionKernels.h" #include "tmaDescriptor.h" #include #include -#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" -#include "tensorrt_llm/kernels/sparseAttentionKernels.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -518,4 +518,5 @@ struct Launch_params }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp index 7af9c4192a..ad133e6603 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp @@ -15,13 +15,17 @@ */ #include "fused_multihead_attention_v2.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include +#include #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -556,7 +560,9 @@ FusedMultiHeadAttentionXMMAKernelV2 const* getXMMAKernelsV2(Data_type inputType, { sm = kSM_120; } - return FusedMHAKernelFactoryV2::Get().getXMMAKernels(sMhaKernelMetaInfosV2, - sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]), inputType, outputType, sm); + return FusedMHAKernelFactoryV2::Get().getXMMAKernels( + sMhaKernelMetaInfosV2, sMhaKernelMetaInfosV2Size, inputType, outputType, sm); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h index 3dc1a6110c..54241f67c9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h @@ -21,6 +21,7 @@ #include "cubin/fmha_cubin.h" #include "cuda_runtime_api.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tmaDescriptor.h" @@ -33,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -153,4 +156,6 @@ using FusedMHAKernelFactoryV2 = TFusedMHAKernelFactory +#include +#include + +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -500,4 +506,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.cu b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu index 8989e95fcf..100635c68f 100644 --- a/cpp/tensorrt_llm/kernels/cumsumLastDim.cu +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include "cumsumLastDim.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -170,4 +171,5 @@ INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_CUMSUM_LastDim_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.h b/cpp/tensorrt_llm/kernels/cumsumLastDim.h index 2266f685eb..7045ec3c19 100644 --- a/cpp/tensorrt_llm/kernels/cumsumLastDim.h +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -34,4 +35,5 @@ void invokeCumsumLastDim(SizeType32 batchSize, SizeType32 inputLength, void cons void* __restrict__ output, void* workspace, size_t tempStorageBytes, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu index 39911eac61..d5633b2cce 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu @@ -15,6 +15,7 @@ */ #include "customAllReduceKernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -26,7 +27,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using tensorrt_llm::common::divUp; @@ -2014,4 +2017,6 @@ void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, c sync_check_cuda_error(stream); } -} // namespace 
tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index c96a1b3064..06b5a281fb 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -16,15 +16,18 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { constexpr size_t WARP_SIZE = 32; @@ -192,4 +195,6 @@ namespace reduce_fusion bool is_lamport_supported(nvinfer1::DataType dataType, int token_num, int hidden_size); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu index 59f3a67f13..a767cfccda 100644 --- a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu @@ -15,6 +15,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" @@ -29,7 +30,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr int BLOCK_SIZE = 1024; @@ -284,4 +287,6 @@ INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, true); INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, true); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h index 
500889c0e5..f8240b4363 100644 --- a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h @@ -16,14 +16,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens, int64_t const numExperts, int64_t const topK, cudaStream_t const stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu index 3fa5fae3af..27958a8671 100644 --- a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu +++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/cuteDslKernels/moeUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::kernels::cute_dsl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { namespace { @@ -557,4 +560,6 @@ INSTANTIATE_MOE_ACTIVATION(__nv_bfloat16, __nv_fp4_e2m1, uint8_t); #endif #undef INSTANTIATE_MOE_ACTIVATION -} // namespace tensorrt_llm::kernels::cute_dsl +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h index 2bd356e3b0..fb84769fd9 100644 --- a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h +++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h" #include #include -namespace 
tensorrt_llm::kernels::cute_dsl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { template void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf, SFType* permuted_sf, @@ -44,4 +47,6 @@ void moeActivation(InputType const* input, OutputType* output, float const* glob cutlass_kernels::ActivationParams activation_params, int32_t const max_num_permuted_tokens, int32_t const interm_size, int32_t const tile_size, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::cute_dsl +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h index a4be82607a..8ea96d0b6a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h @@ -29,12 +29,15 @@ #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/ipcNvlsMemory.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ////////////////////////////////////////////// // Sm100 Two-shot fusion @@ -374,4 +377,6 @@ private: cutlass::KernelHardwareInfo _hw_info; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h index fb446b451d..97bfea0f79 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h @@ -37,12 +37,15 @@ #include "./epilogue/sm90_visitor_allreduce_tma_warpspecialized.hpp" #include "./kernel/sm90_gemm_allreduce_tma_warpspecialized_pingpong.hpp" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/ipcNvlsMemory.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ////////////////////////////////////////////// // Sm90 Two-shot fusion @@ -322,4 +325,6 @@ private: cutlass::KernelHardwareInfo _hw_info; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu index 33f6c61882..2bca57c229 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu @@ -15,13 +15,17 @@ */ #include "./allreduce_gemm_impl_sm100.h" #include "./allreduce_gemm_impl_sm90.h" + +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "cutlass/bfloat16.h" #include "cutlass/float8.h" #include "cutlass/half.h" -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ///////////////////////////////////////////////// // GemmAllReduce implementation specializations @@ -292,4 +296,6 @@ template class GemmAllReduceImplRunner>; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 1283d8936e..028effc68f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #ifdef __GNUC__ // Check if the compiler is GCC or Clang @@ -36,8 +37,8 @@ using namespace tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -693,4 +694,5 @@ CutlassGemmConfig estimate_best_config_from_occupancies(std::vector( } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h index b12fd73724..f18b630767 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -73,4 +74,5 @@ void symmetric_quantize(int8_t* processed_quantized_weight, int8_t* unprocessed_ } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h index 411013aa26..dbbed4e08c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h @@ -16,6 +16,7 @@ #pragma once +#include 
"tensorrt_llm/common/config.h" #include #include "cutlass/half.h" @@ -30,8 +31,8 @@ #include "cutlass/float_subbyte.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -163,4 +164,5 @@ struct CutlassToTllmTypeAdapter } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu index cbf33a9ce5..f4f4e40c01 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -84,4 +85,5 @@ template class CutlassFp4GemmRunner<__nv_bfloat16, FP4GemmType::W4A8_MXFP4_MXFP8 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu index 0b232fb95b..71453157a5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -81,4 +82,5 @@ template class CutlassFp4GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu index d733c97f6b..e187080938 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -81,4 +82,5 @@ template class CutlassFp4GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h index 25cd88b478..003dcb9bb3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h @@ -39,11 +39,13 @@ #include "mxfp8_mxfp4_gemm_template_sm100.h" #include "nvfp4_nvfp4_gemm_template_sm100.h" #include "nvfp4_nvfp4_gemm_template_sm120.h" + +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -527,4 +529,5 @@ size_t CutlassFp4GemmRunner::getWorkspaceSize( } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h index 4191b337fe..3970563bc1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h @@ -29,6 +29,7 @@ #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include 
"tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" @@ -41,8 +42,8 @@ using namespace cute; using namespace tensorrt_llm::kernels::cutlass_kernels; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -291,4 +292,5 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const* } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h index 720e62064d..277a16aa1b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h @@ -29,17 +29,17 @@ #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -#include "tensorrt_llm/common/envUtils.h" - #ifndef _WIN32 #pragma GCC diagnostic pop #endif // #ifndef _WIN32 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -329,4 +329,5 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h index d9eeda8476..eaa3378acb 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h @@ -30,17 +30,17 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/util/packed_stride.hpp" -#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #ifndef _WIN32 #pragma GCC diagnostic pop #endif // #ifndef _WIN32 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -259,4 +259,5 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu index d234ef8b75..e8552e21f0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu @@ -16,9 +16,12 @@ #include "fp8_blockscale_gemm.h" #include "fp8_blockscale_gemm_kernel.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { template @@ -310,4 +313,6 @@ template class CutlassFp8BlockScaleGemmRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv template class CutlassFp8BlockScaleGemmRunner<__nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>; template class CutlassFp8BlockScaleGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h index 29a954ac11..b178c1a1b8 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h @@ -15,13 +15,18 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include #include // non-persistent-cooperative GEMM -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { class CutlassFp8BlockScaleGemmRunnerInterface @@ -146,4 +151,6 @@ private: int64_t expected_m_ = 0; }; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh index 7f95456fb0..e50f2915f2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh @@ -31,10 +31,13 @@ #include "ada_blockwise_gemm/sm89_fp8_gemm_1d1d.cuh" #include "fp8_blockscale_mma_utils.cuh" #include "fp8_blockscale_tma_utils.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/deep_gemm/fp8_gemm.cuh" +TRTLLM_NAMESPACE_BEGIN + namespace kernel_utils { @@ -154,7 +157,7 @@ __inline__ __device__ uint32_t elect_one_sync([[maybe_unused]] int lane_id) } // namespace kernel_utils -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +namespace kernels::fp8_blockscale_gemm { template @@ -1960,4 +1963,6 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* 
mat_a, __nv_fp8_e4m3* fp8_ma } } -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh index 3282f2750c..9b7e9ceb4f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh @@ -15,10 +15,15 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { struct SM90_64x16x32_F32E4M3E4M3_SS @@ -610,4 +615,6 @@ struct Fp8MmaSelector using Type = decltype(select_type()); }; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index 06cff88ad6..a256c09b4a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -15,6 +15,9 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include @@ -24,7 +27,9 @@ #include #include -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { template @@ -138,4 +143,6 @@ __device__ uint64_t mbarrier_arrive_1_expect_tx_cta(void* smem_ptr, uint32_t tx_ return state; } -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h index 7d0816e2eb..3ffe0d317a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h @@ -17,6 +17,7 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -85,4 +86,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu index a1fcb7a5f6..25064c93c5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp8_rowwise_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFp8RowwiseGemmRunner<__nv_bfloat16>; #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu index 83582db603..6f9623c39d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp8_rowwise_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFp8RowwiseGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h index f41637d4ed..68a4066a4f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -43,7 +44,9 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { using namespace cute; @@ -177,4 +180,6 @@ struct DeviceGemmFp8RowwiseSm100 using Gemm = typename cutlass::gemm::device::GemmUniversalAdapter; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h index ea94e6a9b2..468a528cff 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" // clang-format off #include "cutlass/cutlass.h" @@ -35,8 +36,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -132,4 +133,5 @@ struct DeviceGemmFp8RowwiseSm89 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h index 7852e36f3f..4939879761 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h @@ -26,6 +26,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -49,8 +50,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -196,4 +197,5 @@ struct DeviceGemmFp8RowwiseSm90 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h index 3c095421ba..0d601060ee 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h @@ -26,6 +26,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -49,8 +50,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -865,4 +866,5 @@ size_t CutlassFp8RowwiseGemmRunner::getWorkspaceSize(int const m, int const n } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu index e4783fdefd..d3e1a79b35 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu index 8934a2c0df..c3cbcf6ab6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu index b3fa996a87..12c95f73ee 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu index 064e4dbde9..dbcc199193 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu index 0dbdfabe0a..e87751fbad 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightO #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu index 6701d0637e..5d8b9a37c7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightO #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu index ce57833187..dced9c13ba 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu index 7cef1a1272..9de8362de0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu index 66644fcfde..4ce228abc0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type* #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu index 392e2e763b..74341a215d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type* #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu index e40dd578cf..59d3be75ca 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type*/ #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu index 45e0f4c0f8..74fe659257 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -26,4 +27,5 @@ template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu index 113c6c6174..de1189ce34 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu index 6e69985edc..bb41afea9e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu index 51e33974f7..b643e8a043 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu index 148cfb519e..3f6cd93988 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu index 35d199f58f..ccc45aa8c1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index de0c9c61bb..3b30dc77d2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -19,13 +19,14 @@ #include "../include/common.h" #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include #include namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -133,4 +134,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index 360da97532..1ebaecaa11 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -22,6 +22,7 @@ #include "cutlass/gemm/kernel/default_gemm.h" #include "cutlass_extensions/compute_occupancy.h" #include "cutlass_extensions/gemm/device/gemm_universal_base_compat.h" +#include "tensorrt_llm/common/config.h" #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" @@ -44,8 
+45,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -584,4 +585,5 @@ CutlassFpAIntBGemmRunner -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -36,4 +37,5 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 94bf6c9648..06f89bf5fd 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -41,14 +41,15 @@ #endif // __GNUC__ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -298,4 +299,5 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h index 6e670d2d33..42b2dcae58 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h @@ -17,6 +17,7 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -85,4 +86,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h index 07a8b45096..743cb11b2a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -42,8 +43,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -118,4 +119,5 @@ struct DeviceGemmGatedSm90 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h index ce175160a9..d5d8c43233 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -41,8 +42,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -446,4 +447,5 @@ size_t CutlassFusedGatedGemmRunner::getWorkspaceSize(int const m, int const n } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu index 2e603cfb15..6a75517567 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu @@ -15,9 +15,10 @@ */ #include "fused_gated_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFusedGatedGemmRunner<__nv_fp8_e4m3>; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h index 93068447eb..d7c8234839 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -25,7 +26,9 @@ #include "cutlass_extensions/gemm_configs.h" #include 
"tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { using namespace cute; using namespace tensorrt_llm::cutlass_extensions; @@ -248,4 +251,6 @@ private: std::map mGemmRegistry; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h index d6e5c38c10..8a9937c620 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h @@ -16,7 +16,11 @@ #pragma once -namespace tensorrt_llm::kernels::cutlass_kernels +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // IMPORTANT: Keep the same order of activation functions in this enum and the activation functions in @@ -34,4 +38,6 @@ enum class ActivationType Relu2 = 8, }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h index 94318f2e62..944dbc0227 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -97,4 +98,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h index b3e3aafef9..57d59a52a0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h @@ -17,17 +17,14 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -// namespace tk = tensorrt_llm::common; +TRTLLM_NAMESPACE_BEGIN -namespace tkc = tensorrt_llm::cutlass_extensions; - -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -126,4 +123,4 @@ private: }; // namespace cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h index aef897c2e9..a2b7c112bd 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -35,7 +36,9 @@ #include #endif -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template @@ -336,4 +339,6 @@ private: size_t calcMaxWorkspaceSize(int num_experts) const; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index 1f01636217..c4f3fe61f3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -25,6 +25,7 @@ #ifdef ENABLE_FP4 #include #endif 
+#include "tensorrt_llm/common/config.h" #include #include #include @@ -33,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // Change to following declarations must sync with lora.h in public repo class LoraImpl; @@ -1016,4 +1019,6 @@ private: void populateRandomBuffer(void* buffer_void, size_t size, cudaStream_t stream); } // namespace cutlass_kernels -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h index a169bccf20..e902e2c9d6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h @@ -18,6 +18,7 @@ #include "./moe_gemm_kernels.h" #include "cutlass/gemm/gemm.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" @@ -32,7 +33,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace cutlass_kernels @@ -71,4 +74,6 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro cudaStream_t stream); } // namespace cutlass_kernels -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h index 722f817dbb..2de80db507 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h @@ -17,15 +17,17 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/quantization.h" + #include #include namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -91,4 +93,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu index a3633bc099..99c940751e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -29,4 +30,5 @@ template class CutlassInt8GemmRunner<__nv_bfloat16>; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu index 7189956d5d..a1ec5d8d09 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu index 861a2d4ff0..5f0c38eeb5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu index 6814b00e02..f8511d7d0b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index 1f5fedc6fa..b542b0ab32 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -40,6 +40,7 @@ #pragma GCC diagnostic pop #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" @@ -51,8 +52,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -383,4 +384,5 @@ size_t CutlassInt8GemmRunner::getWorkspaceSize(int const m, int const n, int } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h index 2395650223..6b14af0fd1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h @@ -51,6 +51,7 @@ #pragma GCC diagnostic pop #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -64,8 +65,7 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -554,4 +554,4 @@ std::vector CutlassLowLatencyFp8GemmRunner::getConfigs() const }; // namespace cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu index b58d5a1731..edd990c94c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassLowLatencyFp8GemmRunner<__nv_bfloat16>; // for compilation } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu index 2a9e07721f..98017f5930 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class 
CutlassLowLatencyFp8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu index a29b4e9bad..66dfb2596b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassLowLatencyFp8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h index efc7d359f8..49cd2ea262 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h @@ -14,7 +14,11 @@ * limitations under the License. 
*/ -namespace tensorrt_llm::kernels::cutlass_kernels_oss +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { template @@ -22,4 +26,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe ElementType_ const* biases, bool bias_is_broadcast, ElementType_* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy); -} +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl index 85c2f00a54..2d112fb44c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl @@ -25,9 +25,12 @@ #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/kernel/fused_moe_kernel.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { template @@ -93,4 +96,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe auto result = cudaGetLastError(); TLLM_CHECK_WITH_INFO(result == cudaSuccess, "Fail to execute fused moe kernel, cuda error %d\n", (int) (result)); } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h index 87fa89373e..77b809d0f0 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h @@ -17,9 +17,12 @@ #pragma once #include "../../include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; // Keep in sync with the signature generated by generate_kernels.py @@ -31,4 +34,6 @@ void tma_warp_specialized_generic_moe_gemm_kernelLauncher(TmaWarpSpecializedGrou cute::Shape dynamic_cluster_shape, cute::Shape fallback_cluster_shape); -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl index e8f61e300a..56552a484b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl @@ -36,6 +36,7 @@ #include "cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -55,8 +56,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -709,4 +710,5 @@ using namespace cutlass::epilogue; } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h index 2b6b3a81cd..f2d6bcfa3e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h @@ -17,10 +17,11 @@ #include "../../include/moe_gemm_kernels.h" #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -36,4 +37,5 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher( } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index 528c3584a6..86e61c56b2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -54,6 +54,7 @@ #endif // __GNUC__ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -61,8 +62,8 @@ #include "moe_gemm_tma_ws_mixed_input_launcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -246,4 +247,5 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} 
// namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu index be29019bc6..5e090906c0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu index 69ea5c6326..40d5b3e68c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu index cbb8dba108..50480e1f2e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu 
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu index e642d785dc..e129d569fe 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, uint8_t, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu index a47b9f18a9..4e4f87d344 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + 
+TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu index f1a885ea77..9afe0dda88 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu index 234fcc81ae..f8de82e5b1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu index 5448f53271..e8cd6f186e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu index 3f858564cf..01d8c736a7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu index 5c6222f3b4..449e9eec0e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu @@ -15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP4 template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>; template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu index 1238517077..0ebaacdba3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu @@ 
-15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP4 template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>; template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu index 9d86df55fc..2ab4ac4f89 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu @@ -15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP8 template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, half>; @@ -25,4 +28,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>; #endif // template class MoeGemmRunner<__nv_fp8_e5m2, __nv_fp8_e5m2>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu index 812f909493..f749ca9263 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu @@ -15,8 +15,11 @@ */ #include 
"moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP8 template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half>; template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index 95b55e3d84..33ece54627 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -53,6 +53,7 @@ #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -73,7 +74,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { // ============================= Variable batched Gemm things =========================== @@ -473,9 +476,9 @@ void dispatchMoeGemmToCutlass(GroupedGemmInput @@ -967,4 +970,6 @@ void MoeGemmRunner::moeGemm( runGemm(inputs, hopper_inputs); } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h index 65fff6a285..339f95a96d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h @@ -51,6 +51,7 @@ #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" @@ -65,7 +66,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion; @@ -382,7 +385,7 @@ void dispatchMoeGemmSelectClusterShapeTmaWarpSpecialized(TmaWarpSpecializedGroup #undef SHAPE_CASE default: TLLM_THROW("Unsupported cluster shape config %d for MoE gemm.", (int) gemm_config.cluster_shape); } -} // namespace tensorrt_llm +} template void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(TmaWarpSpecializedGroupedGemmInput hopper_input, int num_experts, @@ -511,4 +514,6 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecialized(int num_experts, cutlass_extension return count; } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h index 1ee7232c9e..c4265766b4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h @@ -49,6 +49,7 @@ #include "../include/moe_gemm_kernels.h" #include "launchers/moe_gemm_tma_ws_mixed_input_launcher.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -57,7 +58,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput; @@ -244,4 +247,6 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_ return count; } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu index 59cf79f136..fd3ef0aac6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu @@ -15,6 +15,7 @@ */ #include "../include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include "cutlass/cutlass.h" @@ -25,7 +26,9 @@ #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { std::array TmaWarpSpecializedGroupedGemmInput::workspaceBuffers( int num_experts, FpXBlockScalingType scaling_type) @@ -166,4 +169,6 @@ std::string TmaWarpSpecializedGroupedGemmInput::toString() const return ss.str(); } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index 76c7c58586..32332ec325 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -14,6 +14,7 @@ * limitations under the 
License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/workspace.h" #include @@ -71,7 +72,9 @@ using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { /** * Takes the input maps and prepares the expanded maps for min latency @@ -4747,4 +4750,6 @@ template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh index 0a752f7b1f..36e271228d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "cutlass/epilogue/thread/activation.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // ============================== Activation Adaptors ================================= @@ -72,4 +75,6 @@ struct SwigluBiasAdaptor } }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h index a662030ac2..a96a43a964 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h @@ -19,12 +19,15 @@ #include "../include/moe_gemm_kernels.h" #include "cutlass/arch/mma_sm90.h" #include 
"cutlass_extensions/epilogue_helpers.h" +#include "tensorrt_llm/common/config.h" #ifdef ENABLE_FP4 #include #endif -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // Blackwell arch @@ -103,4 +106,6 @@ constexpr bool isValidAmpereMOESpecialisation() #endif } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py index 85012c79ba..61070281c4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py @@ -308,8 +308,8 @@ def get_file_content(launcher_inl_files, operations): instantiations = "\n".join(insts_list) file_content = f"""{includes} -namespace tensorrt_llm -{{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN namespace kernels {{ namespace cutlass_kernels_oss @@ -319,7 +319,7 @@ namespace cutlass_kernels_oss }} // namespace cutlass_kernels_oss }} // namespace kernels -}} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END """ return file_content diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu index 7791499fd1..b2b6149d29 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace mmha @@ -176,4 +177,5 @@ INSTANTIATE_MMHA_NORMAL_AND_PAGED(__nv_bfloat16, false) //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h index 3f2705f2ee..9ef6593d16 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/kernels/gptKernels.h" @@ -26,8 +27,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -294,4 +295,5 @@ inline int estimate_min_multi_block_count(int max_timesteps, int max_dynamic_shm } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp new file mode 100644 index 0000000000..5cf342347f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e10afcbcfe15eb73c30612fa13d6a75d45e4a7fe2c5c4ec32ca4643a1508f214 +size 273632 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h index d39f5adc5d..875aaee182 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h @@ -14,1264 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -namespace tensorrt_llm -{ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + namespace kernels { -// clang-format off -// SingleQueryToken kernels. -#ifndef EXCLUDE_SM_80 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; 
-extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -#endif - -#ifndef EXCLUDE_SM_86 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long 
long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -#endif - -#ifndef EXCLUDE_SM_89 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; 
-extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -#endif - -#ifndef EXCLUDE_SM_90 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; - -// MultiQueryToken kernels. -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; - -// MHA with beamWidth=4 -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; - -// SingleQueryToken kernels. 
-extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern 
uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern 
uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; - -// MultiQueryToken kernels. -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len; 
-extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; - -// MHA with beamWidth=4 -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -#endif - -#ifndef EXCLUDE_SM_120 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long 
long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; - - -#endif - -static const struct XQAKernelMetaInfo +struct XQAKernelMetaInfo { Data_type mDataType; Data_type mKVDataType; @@ -1285,634 +36,13 @@ static const struct XQAKernelMetaInfo unsigned int mSM; const unsigned long long* mCubin; unsigned int mCubinSize; 
- const char* mFuncName; -} sXqaKernelMetaInfo[] = { -// SingleQueryToken kernels. -#ifndef EXCLUDE_SM_80 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 
0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_80, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_80, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_80, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_80, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_86 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, 
true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_86, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_86, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_86, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_86, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_89 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_89, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_89, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, 
kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_90 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, 
kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_90, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_90, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_90, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -// MultiQueryToken kernels. -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -// MHA with beamWidth=4 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, 
false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_120 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, 
true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 16, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 32, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_120, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 16, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 32, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 0, false, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 64, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"} - -#endif + char const* mFuncName; }; +extern XQAKernelMetaInfo const sXqaKernelMetaInfo[]; +extern size_t const sXqaKernelMetaInfoSize; + // clang-format on } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h index c85f2f2c30..bf6b22385e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h @@ -17,6 +17,7 @@ #include "decoderMaskedMultiheadAttentionTemplate.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" @@ -32,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -492,4 +493,5 @@ void mmha_launch_kernel(KernelParamsType const& params, KVCacheBuffer const& kv_ const KVLinearBuffer& shift_k_cache, const cudaStream_t& stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index 21b9112b9f..5bb632465d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" @@ -37,8 +38,8 @@ #include #endif // ENABLE_MULTI_BLOCK_OPTION -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2753,4 +2754,5 @@ __global__ void 
__launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h index 6e8dce40ac..647e92cc76 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h @@ -16,11 +16,12 @@ * This file contains constants that decoderXQA*.{h,cpp} need. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { inline constexpr int kMinHistoryTokensPerBlock = 128; @@ -40,4 +41,5 @@ inline constexpr int getXqaMaxNumSubSeq(bool isMLA) } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp index 20588b0afa..8ac26a0cc8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" @@ -22,8 +23,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -52,4 +53,5 @@ std::unique_ptr DecoderXQAImpl::create(DecoderXQARunner* runner, } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h index f43c186d8c..7d39f36da2 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h @@ -14,13 +14,14 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -84,4 +85,5 @@ enum class XQAKernelType : int32_t }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp index bcdac05b91..dffc83764e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp @@ -16,8 +16,11 @@ * Common utils to be shared between Precompiled and JIT implementation. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { uint32_t getKernelMTileSize( @@ -59,4 +62,6 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParam isXqaJit ? std::optional(xqaParams.position_embedding_type) : std::nullopt}; } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h index f2dcb7a858..eb907edff1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h @@ -18,6 +18,7 @@ #pragma once #include "decoderXQAConstants.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -30,8 +31,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -482,4 +483,5 @@ inline int computeMultiBlockCountSpecDecGMMA( } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp index f0c71f3766..33587d7961 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp @@ -18,6 +18,7 @@ #include "cubinObj.h" #include "nvrtcWrapper/include/nvrtcWrapper.h" #include "tensorrt_llm/common/assert.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/utils.h" @@ -44,8 +45,8 @@ void CHECK_TLLM_XQA_JIT_ERROR_(tllmXqaJitStatus result, char const* const func, } // anonymous namespace -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -133,4 +134,5 @@ CompileEngine::CompileEngine(int SM, XQAParams const& xqaParams) } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h index 01db871995..8995e03dd0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h @@ -15,12 +15,13 @@ */ #pragma once #include "cubinObj.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -43,4 +44,5 @@ private: } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp index f5910b5817..b57eec1b14 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp @@ -17,12 +17,15 @@ #include "serializationUtils.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include -namespace tensorrt_llm::kernels::jit +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::jit { CubinObj::CubinObj(void const* buffer_, size_t buffer_size) @@ -184,4 +187,6 @@ CubinObj::~CubinObj() } } -} // namespace tensorrt_llm::kernels::jit +} // namespace kernels::jit + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h index 4eb3ca1095..3cb176407f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h @@ -14,14 +14,15 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -86,4 +87,5 @@ private: } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h index 468cd77bc1..2eb9ef89db 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h @@ -18,13 +18,16 @@ #include "compileEngine.h" #include "serializationUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include #include #include -namespace 
tensorrt_llm::kernels::jit +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::jit { // A thread-safe collection of CubinObjs, with caching functionality. @@ -173,4 +176,6 @@ using CubinObjKey = XQAKernelFullHashKey; using CubinObjHasher = XQAKernelFullHasher; using CubinObjRegistry = CubinObjRegistryTemplate; -} // namespace tensorrt_llm::kernels::jit +} // namespace kernels::jit + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index 03295d6d16..90dda051a0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -17,6 +17,7 @@ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "compileEngine.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/utils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h" @@ -43,7 +44,9 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromKernelMeta(XQAKernelMetaInfo const& } // anonymous namespace -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { DecoderXQAImplJIT::DecoderXQAImplJIT(DecoderXQARunner* runner) @@ -545,4 +548,6 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h index b051d7bd35..902ec0b809 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" #include "compileEngine.h" @@ -23,8 +24,8 @@ #include "tensorrt_llm/plugins/common/plugin.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -75,4 +76,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp index c19b482b30..26fadd21cc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp @@ -14,12 +14,13 @@ * limitations under the License. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/utils.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -205,4 +206,5 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h index c67e54459c..8d3b43b44f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h @@ -14,11 +14,12 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include "tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -32,4 +33,5 @@ bool supportConfigTllmGen( } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h index f48af0f7c8..456680907d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h @@ -14,13 +14,14 @@ * limitations under the License. 
*/ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace jit @@ -49,4 +50,5 @@ void writeToBuffer(T output, uint8_t*& buffer, size_t& remaining_buffer_size) } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 2cf90486d3..7bd7c32e5e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/workspace.h" @@ -33,7 +34,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class XQAKernelList @@ -44,7 +47,7 @@ public: XQAKernelList(Data_type type, unsigned int sm) : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) , mDataType(type) - , mKernelMetaCount(sizeof(sXqaKernelMetaInfo) / sizeof(sXqaKernelMetaInfo[0])) + , mKernelMetaCount(sXqaKernelMetaInfoSize) , mKernelMeta(&sXqaKernelMetaInfo[0]) , mSM(sm) { @@ -557,4 +560,6 @@ void DecoderXQAImplPrecompiled::runWithKVBlockArray( runDispatchBuffer(xqa_params, kv_block_array, stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h index e41d637597..7f48b47468 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h @@ -14,10 +14,11 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -47,4 +48,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp index 946fea5a7e..165ffc2848 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp @@ -22,6 +22,7 @@ #include #include +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/workspace.h" @@ -31,8 +32,8 @@ #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -181,4 +182,4 @@ void DecoderXQARunnerResource::serialize(void* buffer, size_t buffer_size) const } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h index 1604c697fe..b53bd4a94e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h @@ 
-20,6 +20,7 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h" @@ -32,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -157,4 +158,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu index 4ed7b39b88..1d24c2fc3e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu index bbc6e0ed17..99e185e64d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu @@ -15,9 +15,10 @@ */ #include 
"../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu index 17e3601acf..c863acba6b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu index bdce42d97c..e98633b4f5 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } 
// namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu index bcc07aa8a0..20681f3274 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu index 0b6497b092..cc870a5256 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu index 3eacc7a74f..d971b5d76e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu index 65e747caf6..e60b735945 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu index a43569fa1a..64df7751fb 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu index 48f2a413f0..afa21e48ca 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(__nv_bfloat16, kSiz } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu index fc652b5a4f..bb7ecbafea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu index ee15867353..0914573412 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu index 74c708b767..3aa0970e0b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu index e12f887d8f..0a4573c21a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(float, kSizePerHead } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu index 4078e2bc60..3a224a79f2 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu index 4f61bf42a2..cb0574baad 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu index 867c2df240..b02a92a351 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu index 8b7d988b0c..40de9b4dd7 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(uint16_t, kSizePerH } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu index 72aab18ab9..8cfc95fec6 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu index d5b2ab6627..825add47ff 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu index 79d3f3920a..a07e1340ed 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu index bd65335d75..7230657f3e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu index e7f7f1bf76..09b32df680 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu @@ 
-15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu index 8928b538c5..7c13505994 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu index 0229ec07b0..d799feb598 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, 
kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu index 0fca76aa35..f79fa11615 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu index 181cf5c8f3..e49050ab7f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu index d25a1d901f..b40711f997 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu index 3eded458eb..0dc1a472a1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu index d80110c60e..2b63fb389e 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu index 33d1724961..696e2b9bab 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu index 786cbafca3..e18af09838 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(__nv_bfloat16, kSiz } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu index 44e030d532..deb057598f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu index 985a24c45b..7c5c498e08 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include 
"../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(float, kSizePerHead } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu index 016b10fc50..90469e87d0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu index 2a709eecd5..7d27fe99a9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ 
-34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(uint16_t, kSizePerH } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu index 6afa825ae8..bb6c6ee48d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu index 1906b9816a..127477fd71 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace 
kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu index 28ca9c7e82..9404f14a29 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu index 9550440780..b9fc4249b9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu index ba9ee36cc2..73d4bf4773 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu index 288338f946..5f289fad6f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu index 6cd98308a6..98f5956732 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu index 72c2ef160e..09c1d6f8f4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu index df10f905de..96c271547a 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu index 90f338470e..0eb62b8567 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu index dbbccf2d0f..a739bafe59 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu index 775ed1038d..bb0b54ec88 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu index 87726296e3..ae3be8f097 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu index 4d29cc40fa..77f0539380 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu index a247a07a3f..59caa0fae7 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu @@ 
-15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu index 11ecb92a66..8c564959d0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu index af9f4f4fec..76e54cf297 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ 
INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu index 3f8e9c4c23..c50b41b187 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu index 286ed2b2fb..3b6d1c6c0f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu index ef886b9412..88217b08bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu index af8f7fa4d2..b1a188a6ea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp index 6c2180ba80..e4b642a11e 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp @@ -14,12 +14,15 @@ * limitations under the License. */ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -183,4 +186,6 @@ CUtensorMap makeTensorMapForXqaMlaQ( return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size, xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h index da4240d277..03b2373bcd 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h @@ -14,11 +14,12 @@ * limitations under the License. 
*/ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ CUtensorMap makeTensorMapForXqaMlaQ( std::shared_ptr const& driver, XQAParams const& xqaParams, void const* q); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 35115b8cb6..6ac232e499 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -14,13 +14,14 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -206,4 +207,5 @@ struct XQAParams }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h index 09bd551c0b..aa7e31dbd1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/gptKernels.h" #include @@ -31,8 +32,8 @@ using tensorrt_llm::common::float22bf162; using tensorrt_llm::common::hsub2; #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -4256,4 +4257,5 @@ 
__device__ __host__ constexpr inline T const& const_max(T const& a, T const& b) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decodingCommon.cu b/cpp/tensorrt_llm/kernels/decodingCommon.cu index ad8249a3a6..1e091d6961 100644 --- a/cpp/tensorrt_llm/kernels/decodingCommon.cu +++ b/cpp/tensorrt_llm/kernels/decodingCommon.cu @@ -14,10 +14,11 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/decodingCommon.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" + #include "tensorrt_llm/common/reduceKernelUtils.cuh" +#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void curandInitialize(curandState_t* state, int const* batchSlots, int const size, uint64_t const randomSeed) @@ -235,4 +238,6 @@ template void invokeScatterDecodingParams( template void invokeScatterDecodingParams( int32_t const* src, int32_t scalar, int32_t* dst, int const* batchSlots, int batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 77bc6b71ae..98b25dde4c 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/decodingKernels.h" @@ -30,8 +31,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace 
kernels { @@ -712,7 +712,9 @@ void invokeTransposeLogProbs(float* outputLogProbs, float* outputLogProbsTiled, } // namespace kernels -namespace runtime::kernels +TRTLLM_NAMESPACE_END + +namespace tensorrt_llm::runtime::kernels { // Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, @@ -802,6 +804,4 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -} // namespace runtime::kernels - -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index cf648c7605..0e4fded936 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/decodingInput.h" @@ -25,8 +26,7 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -117,7 +117,9 @@ void invokeTransposeLogProbs(float* output_log_probs, float* output_log_probs_ti } // namespace kernels -namespace runtime::kernels +TRTLLM_NAMESPACE_END + +namespace tensorrt_llm::runtime::kernels { //! \brief Inserts the running beams into the finished beams stored in the CBA buffers. (beams where the most likely //! continuation is the end token get stored separately, and another candidate next token is stored). 
Then sorts the @@ -132,6 +134,4 @@ namespace runtime::kernels void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream); -} // namespace runtime::kernels - -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/kernels/delayStream.cu b/cpp/tensorrt_llm/kernels/delayStream.cu index ec0146c4b8..89b4b2cca9 100644 --- a/cpp/tensorrt_llm/kernels/delayStream.cu +++ b/cpp/tensorrt_llm/kernels/delayStream.cu @@ -13,12 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/delayStream.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void delayStreamKernel(long long delay_micro_secs) { @@ -34,4 +37,6 @@ void invokeDelayStreamKernel(long long delay_micro_secs, cudaStream_t stream) delayStreamKernel<<<1, 1, 0, stream>>>(delay_micro_secs); check_cuda_error(cudaGetLastError()); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/delayStream.h b/cpp/tensorrt_llm/kernels/delayStream.h index 8266416da6..65035e3a82 100644 --- a/cpp/tensorrt_llm/kernels/delayStream.h +++ b/cpp/tensorrt_llm/kernels/delayStream.h @@ -16,9 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeDelayStreamKernel(long long delay_micro_secs, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/doraScaling.cu b/cpp/tensorrt_llm/kernels/doraScaling.cu index c2308f0874..bd441cfb49 100644 --- 
a/cpp/tensorrt_llm/kernels/doraScaling.cu +++ b/cpp/tensorrt_llm/kernels/doraScaling.cu @@ -14,12 +14,15 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaUtils.h" // TODO(oargov): literally zero performance optimization work was put into these kernels and their launch parameters, // since they should hopefully be fused to some gemm eventually. -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template __global__ void tokenPerChannelScaleKernel(size_t const numModules, size_t const numTokens, @@ -89,4 +92,6 @@ template void tokenPerChannelScale(int64_t const numel, size_t cons nv_bfloat16 const* const* __restrict__ scale_ptrs, nv_bfloat16* __restrict__ result, cudaStream_t stream); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/doraScaling.h b/cpp/tensorrt_llm/kernels/doraScaling.h index 4b24f26ff2..9df8661e07 100644 --- a/cpp/tensorrt_llm/kernels/doraScaling.h +++ b/cpp/tensorrt_llm/kernels/doraScaling.h @@ -15,14 +15,16 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template void tokenPerChannelScale(int64_t const numel, size_t const numModules, size_t const numGroups, int64_t const* __restrict__ cumModuleSizes, T const* a, T const* const* scale_ptrs, T* result, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu index 1480be8140..8e8e819117 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu @@ -21,6 +21,7 @@ #include "cuda.h" 
#include "cuda_bf16.h" #include "cuda_runtime.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h" @@ -29,7 +30,9 @@ using bf16_t = __nv_bfloat16; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { __device__ void hmma_16_8_16_f32acc_bf16ab( @@ -681,4 +684,6 @@ template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 8>( template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 16>( __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, int num_tokens, cudaStream_t); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h index 36548da54c..6adaec89da 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h @@ -17,15 +17,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { template void invokeFusedAGemm(T* output, T const* mat_a, T const* mat_b, int num_tokens, cudaStream_t const stream); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu index 34557cc490..0b406e103f 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu @@ -14,11 +14,15 @@ 
* limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" + #include "tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { // Custom FMA implementation using PTX assembly instructions @@ -238,4 +242,6 @@ template void tensorrt_llm::kernels::dsv3MinLatencyKernels::invokeRouterGemm<__n template void tensorrt_llm::kernels::dsv3MinLatencyKernels::invokeRouterGemm<__nv_bfloat16, 16, 256, 7168>( float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h index 948b1ef8d4..ffd77cf12a 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h @@ -16,15 +16,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { template void invokeRouterGemm(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp index b46564d49a..4103729940 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp @@ -15,9 +15,12 @@ */ #include "fmhaDispatcher.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -247,4 +250,6 @@ void FmhaDispatcher::run(MHARunnerParams runnerParams) //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.h b/cpp/tensorrt_llm/kernels/fmhaDispatcher.h index f79c55d380..26a40411fd 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.h +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" @@ -23,7 +24,9 @@ using tensorrt_llm::common::op::UniqPtrWNullCopy; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -62,4 +65,6 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh index 13de943b43..eda5f38d31 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include 
"tensorrt_llm/common/cudaFp8Utils.h" @@ -29,7 +30,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -261,4 +264,6 @@ struct FP4Converter } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h index 22c1dc40ed..0e05e0a835 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h @@ -16,10 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -41,4 +44,6 @@ struct GeneralFP4AddBiasResidualPreLayerNormParam cudaStream_t stream; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh index a9cf71a2a8..1e2ebd62d0 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -27,7 +28,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -333,4 +336,6 @@ struct LowLatencyLayerNorm } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh 
index 51c6ca7564..5776c41119 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { struct DummyFusedOperator @@ -838,4 +841,6 @@ __global__ void __launch_bounds__(TARGET_THREADS, 1) warpSpecializedInvoker(type T::run(param); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h old mode 100755 new mode 100644 index b5c00f90ce..c7579251fb --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h @@ -15,9 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { struct WarpSpecializedCounters @@ -43,4 +46,6 @@ enum class SCALE_TYPE template void invokeWSLayerNorm(WarpSpecializedParam param, bool use_rms_norm, int ctas); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu index 4dc10f05e7..9103491cdd 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/logger.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -317,4 +320,6 @@ void invokeWSLayerNorm invokeWSLayerNormImpl(param, use_rms_norm, ctas); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu index 7b53818762..633b276b12 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu @@ -14,16 +14,17 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/fusedMoeCommKernels.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaUtils.h" #include -#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/fusedMoeCommKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1779,4 +1780,5 @@ void launchLocalFifoSendRecv(FusedMoeFieldInfo const& sendFieldInfo, FusedMoeFie } // namespace fused_moe_comm_tests } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h index 7a17257bff..31aab22507 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h @@ -19,12 +19,13 @@ #include +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/moeCommKernelsCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -558,4 +559,5 @@ void launchLocalFifoSendRecv(FusedMoeFieldInfo 
const& sendFieldInfo, FusedMoeFie } // namespace fused_moe_comm_tests } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu index 80245d0b52..73326af8c4 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu @@ -15,6 +15,7 @@ */ #include "fusedQKNormRopeKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/mathUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Specialization for packed_as used in this kernel. template <> @@ -44,9 +47,12 @@ struct packed_as { using type = uint4; }; -} // namespace tensorrt_llm::common +} // namespace common -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_END +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -327,4 +333,6 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_ default: TLLM_THROW("Unsupported head dimension for fusedQKNormRope: %d", head_dim); } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h index 85d71f7e7c..7dab7dbbb2 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h @@ -16,10 +16,11 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -46,4 +47,5 @@ void launchFusedQKNormRope( bool is_qk_norm); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/gptKernels.cu b/cpp/tensorrt_llm/kernels/gptKernels.cu index 7d6332d1a4..082709e7af 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.cu +++ b/cpp/tensorrt_llm/kernels/gptKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -26,8 +27,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -358,4 +359,5 @@ __global__ void updatePaddingCountKernel(int* paddingPerSeq, int const* seqLengt } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/gptKernels.h b/cpp/tensorrt_llm/kernels/gptKernels.h index 38c56be902..f5ba9a1b76 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.h +++ b/cpp/tensorrt_llm/kernels/gptKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/runtime/iTensor.h" #include @@ -22,8 +23,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -275,4 +276,5 @@ template void invokeBuildDecoderInfo(BuildDecoderInfoParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupGemm.cu b/cpp/tensorrt_llm/kernels/groupGemm.cu index 5305e85a4f..5b8c0d9291 100644 --- a/cpp/tensorrt_llm/kernels/groupGemm.cu +++ b/cpp/tensorrt_llm/kernels/groupGemm.cu @@ -24,12 +24,13 @@ #include "groupGemm.h" #include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" +#include 
"tensorrt_llm/common/memoryUtils.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -259,4 +260,4 @@ void groupedGemm(std::vector problem_sizes, std::vecto } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupGemm.h b/cpp/tensorrt_llm/kernels/groupGemm.h index 0fabcb9562..dbc1e498b7 100644 --- a/cpp/tensorrt_llm/kernels/groupGemm.h +++ b/cpp/tensorrt_llm/kernels/groupGemm.h @@ -16,10 +16,11 @@ #pragma once #include "cutlass/gemm_coord.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -32,4 +33,4 @@ void groupedGemm(std::vector problem_sizes, std::vecto } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu index b13c8e100f..58b6bc9d8f 100644 --- a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu +++ b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include #include @@ -23,7 +24,9 @@ #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h" -namespace tensorrt_llm::kernels::group_rms_norm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::group_rms_norm { // Helper function to calculate the number of warps to launch for GroupRMSNormBase template @@ -876,4 +879,6 @@ void GroupRMSNormKernelLauncherWithHeuristic(GroupRMSParams& params) INSTANTIATE_GROUP_RMS_NORM_WITH_HEURISTIC(1) INSTANTIATE_GROUP_RMS_NORM_WITH_HEURISTIC(2) -} // namespace tensorrt_llm::kernels::group_rms_norm +} // namespace kernels::group_rms_norm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h index c121705f6d..335adf44ed 100644 --- a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h +++ b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h @@ -14,15 +14,18 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::group_rms_norm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::group_rms_norm { template @@ -73,4 +76,6 @@ void GroupRMSNormKernelLargeBatchLauncher(GroupRMSParams& params); template void GroupRMSNormKernelLauncherWithHeuristic(GroupRMSParams& params); -} // namespace tensorrt_llm::kernels::group_rms_norm +} // namespace kernels::group_rms_norm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixKernels.cu b/cpp/tensorrt_llm/kernels/helixKernels.cu index c08b244de9..ffaa490b14 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.cu +++ b/cpp/tensorrt_llm/kernels/helixKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/helixKernels.h" @@ -29,8 +30,8 @@ using namespace tensorrt_llm::common; namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { static constexpr int WARP_SIZE = 32; @@ -240,4 +241,5 @@ INSTANTIATE_POST_PROC(__half); INSTANTIATE_POST_PROC(__nv_bfloat16); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixKernels.h b/cpp/tensorrt_llm/kernels/helixKernels.h index 2a0e632434..d7b96e32bd 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.h +++ b/cpp/tensorrt_llm/kernels/helixKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include @@ -23,8 +24,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -43,4 +44,5 @@ template void helixPostProcess(HelixPostProcParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu b/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu index 3cb35273a9..3132d166f6 100644 --- a/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu +++ b/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu @@ -16,9 +16,12 @@ #include "IndexerKCacheScatter.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -149,4 +152,6 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca TLLM_CUDA_CHECK(cudaGetLastError()); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/indexerTopK.cu b/cpp/tensorrt_llm/kernels/indexerTopK.cu index 
40e377c998..740e83f0bb 100644 --- a/cpp/tensorrt_llm/kernels/indexerTopK.cu +++ b/cpp/tensorrt_llm/kernels/indexerTopK.cu @@ -16,6 +16,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/noAuxTcKernels.h" @@ -25,7 +26,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace { @@ -766,4 +769,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 6f777b25ff..ac28ba8f9f 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0482a61bb6d9435386aa5dcf155145e51cc6f820bfc52ffdecb0dd12c0368ae4 -size 67086296 +oid sha256:0a345d90233d94c0b3f6b9f5c6e79152852354e174f0edd68f00c2554e9e32b5 +size 67111548 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index 4563244946..eb6005bb71 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 
-40a3ef577419b5a9c6d5ca0d3201603889622eb62048319f657cbffc2c076be3 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 33f251e0599197ad3e6c59d64a42f9721d3cc27c +389ecc2585d407dcf336cfb5d1fdf7cdf77922998b0560743c5b162172fa57c1 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 9fc66c405c7caaaeb65542ba1498f00d863f0a4a diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h index b7eba1ab34..09c1fbd586 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -23,18 +24,29 @@ #include "cutlass/layout/layout.h" #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -namespace tensorrt_llm::kernels::cutlass_kernels -{ using namespace cute; using namespace tensorrt_llm::cutlass_extensions; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels +{ enum GemmAllReduceImpl { kNVLS_2SHOT }; +// Specifies whether to use SM or switch for allreduce. +// SM is more efficient for GPUs=2 and switch for GPUs>2. +enum ReduceLocationType +{ + kSM, + kSWITCH +}; + // Decouples IPluginResource from the GemmAllReduce runner interface. 
class PersistentWorkspaceInterface { @@ -42,7 +54,6 @@ public: virtual ~PersistentWorkspaceInterface() = default; virtual void allocate() = 0; virtual int free() = 0; - virtual size_t size() = 0; }; class GemmAllReduceImplInterface @@ -55,6 +66,7 @@ public: { GemmAllReduceImpl impl; MainloopScheduleType schedule; + ReduceLocationType reduce_location; TileShape tile_shape; ClusterShape cluster_shape; int MMA_SMs; @@ -71,10 +83,21 @@ public: return ""; }; + auto get_reduction_name = [&]() + { + switch (reduce_location) + { + case ReduceLocationType::kSM: return "SM"; + case ReduceLocationType::kSWITCH: return "Switch"; + } + return ""; + }; + std::stringstream ss; ss << "LaunchConfig("; ss << get_impl_name(); ss << ", Schedule_" << get_mainloop_schedule_name(schedule); + ss << ", Reduction_" << get_reduction_name(); ss << ", TileShape_" << get_tile_shape_name(tile_shape); ss << ", ClusterShape_" << get_cluster_shape_name(cluster_shape); ss << ", MmaSms_" << MMA_SMs; @@ -84,8 +107,8 @@ public: bool operator<(LaunchConfig const& other) const { - return std::tie(impl, schedule, tile_shape, cluster_shape, MMA_SMs) - < std::tie(other.impl, other.schedule, other.tile_shape, other.cluster_shape, other.MMA_SMs); + return std::tie(impl, schedule, reduce_location, tile_shape, cluster_shape, MMA_SMs) < std::tie(other.impl, + other.schedule, other.reduce_location, other.tile_shape, other.cluster_shape, other.MMA_SMs); } }; @@ -248,4 +271,6 @@ private: std::map mGemmRegistry; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h index 25b4aff8f3..37f55f3edd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h @@ -21,13 +21,14 @@ #include #include 
"cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace internal_cutlass_kernels @@ -98,4 +99,5 @@ private: } // namespace internal_cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h index fed9276e03..6cb38013c4 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h @@ -18,17 +18,14 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -// namespace tk = tensorrt_llm::common; +TRTLLM_NAMESPACE_BEGIN -namespace tkc = tensorrt_llm::cutlass_extensions; - -namespace tensorrt_llm -{ namespace kernels { namespace internal_cutlass_kernels @@ -127,4 +124,4 @@ private: }; // namespace internal_cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h index 9b6e4f042f..ed52b52928 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h @@ -17,13 +17,14 @@ #pragma once #include "low_latency_gemm.h" +#include "tensorrt_llm/common/config.h" // namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ 
+TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace internal_cutlass_kernels @@ -73,4 +74,5 @@ private: }; // namespace internal_cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h index b00fa18e11..e3d62ef3b7 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/workspace.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h" @@ -37,9 +38,7 @@ #include #endif -namespace tensorrt_llm -{ - +TRTLLM_NAMESPACE_BEGIN // Note update moe.py to match enum class ActivationType { @@ -50,7 +49,6 @@ enum class ActivationType Geglu, SwigluBias, Identity, - Relu2, InvalidType }; @@ -196,8 +194,7 @@ struct TmaWarpSpecializedGroupedGemmInput struct INT4GroupwiseParams { - constexpr static int int4_group_size = 128; - constexpr static int wfp4a16_group_size = 32; + constexpr static int group_size = 128; // Unused, hard-coded to 128 bool enabled = false; using SFA = __nv_bfloat16; using SFB = __nv_bfloat16; // Unused @@ -266,6 +263,7 @@ public: #else static constexpr bool use_fp8 = false; static constexpr bool use_w4afp8 = false; + static constexpr bool use_wfp4afp4 = false; #endif #if defined(ENABLE_FP4) @@ -316,4 +314,4 @@ private: size_t calcMaxWorkspaceSize(int num_experts) const; }; -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h index a68e0b9bfe..132990603d 100644 --- 
a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h @@ -19,10 +19,10 @@ #include "cutlass/gemm/gemm.h" #include "moe_gemm_kernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" -#include #ifdef ENABLE_FP4 #include #endif @@ -34,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static inline size_t pad_to_multiple_of_16(size_t const& input) @@ -425,9 +427,9 @@ public: virtual void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const num_valid_rows, int64_t const hidden_size, - int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, - void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool use_lora, + QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, + int* expanded_source_row_to_expanded_dest_row, MOEParallelismConfig parallelism_config, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) = 0; @@ -439,11 +441,11 @@ public: int64_t const* const num_valid_tokens_ptr, void const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const 
fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, - bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert) + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) = 0; virtual void gemm2(void const* const input, void* const gemm_output, void* const final_output, @@ -451,14 +453,14 @@ public: void const* const fc2_expert_weights, void const* const fc2_expert_biases, void const* const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const 
num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream, - MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert) + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) = 0; virtual std::pair @@ -470,7 +472,7 @@ public: TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, - int const* permuted_row_to_unpermuted_row, cudaStream_t stream) + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream) = 0; virtual std::pair @@ -573,9 +575,9 @@ public: void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const num_valid_rows, int64_t const hidden_size, - int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, - void* 
final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool use_lora, + QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, + int* expanded_source_row_to_expanded_dest_row, MOEParallelismConfig parallelism_config, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) override; @@ -593,11 +595,10 @@ public: ScaleBiasType const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert); + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, + bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert); static void gemm2(MoeGemmRunner& gemm_runner, DeepSeekBlockScaleGemmRunner* fp8_blockscale_gemm_runner, T const* const input, void* const gemm_output, @@ -606,14 +607,13 @@ public: ScaleBiasType const* const fc2_expert_biases, ScaleBiasType const* 
const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, cudaStream_t stream, MOEParallelismConfig parallelism_config, - cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, - int* active_expert_global_ids, int start_expert); + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + cudaStream_t stream, MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, + bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert); // Overrides to allow us to forward on to the internal functions with the pointers using the correct type void gemm1(void const* const input, void* const output, void* const intermediate_result, @@ -622,21 +622,20 @@ public: int64_t const* const num_valid_tokens_ptr, void const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, 
TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, - bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert) override + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) override { auto* block_scale_gemm_runner = use_deepseek_fp8_block_scale ? 
getDeepSeekBlockScaleGemmRunner() : nullptr; return Self::gemm1(moe_gemm_runner_, block_scale_gemm_runner, static_cast(input), static_cast(output), intermediate_result, expert_first_token_offset, tma_ws_input_template, static_cast(fc1_expert_weights), static_cast(fc1_expert_biases), num_valid_tokens_ptr, static_cast(fc1_int_scales), fc1_fp8_dequant, fc2_fp8_quant, - fc1_fp4_act_flat, fc2_fp4_act_flat, quant_params, num_rows, expanded_num_rows, expected_tokens_per_expert, - hidden_size, inter_size, num_experts_per_node, fc1_activation_type, alpha_scale_ptr_array, - bias_is_broadcast, stream, config, min_latency_mode, num_active_experts_per, active_expert_global_ids, - start_expert); + fc1_fp4_act_flat, fc2_fp4_act_flat, quant_params, num_rows, expanded_num_rows, hidden_size, inter_size, + num_experts_per_node, fc1_activation_type, alpha_scale_ptr_array, bias_is_broadcast, stream, config, + min_latency_mode, num_active_experts_per, active_expert_global_ids, start_expert); } void gemm2(void const* const input, void* const gemm_output, void* const final_output, @@ -644,25 +643,25 @@ public: void const* const fc2_expert_weights, void const* const fc2_expert_biases, void const* const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const 
num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream, - MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert) override + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) override { auto* block_scale_gemm_runner = use_deepseek_fp8_block_scale ? getDeepSeekBlockScaleGemmRunner() : nullptr; return Self::gemm2(moe_gemm_runner_, block_scale_gemm_runner, static_cast(input), gemm_output, static_cast(final_output), expert_first_token_offset, tma_ws_input_template, static_cast(fc2_expert_weights), static_cast(fc2_expert_biases), static_cast(fc2_int_scales), fc2_fp8_dequant, fc2_fp4_act_flat, quant_params, - token_topk_unpermuted_scales, token_topk_permuted_scales, unpermuted_row_to_permuted_row, - permuted_row_to_unpermuted_row, expert_for_source_row, num_valid_tokens_ptr, num_rows, expanded_num_rows, - expected_tokens_per_expert, hidden_size, inter_size, num_experts_per_node, experts_per_token, - alpha_scale_ptr_array, use_lora, fc2_lora, stream, parallelism_config, config, min_latency_mode, - num_active_experts_per, active_expert_global_ids, start_expert); + token_topk_unpermuted_scales, token_topk_permuted_scales, expanded_source_row_to_expanded_dest_row, + expanded_dest_row_to_expanded_source_row, expert_for_source_row, num_valid_tokens_ptr, num_rows, + expanded_num_rows, hidden_size, inter_size, num_experts_per_node, 
experts_per_token, alpha_scale_ptr_array, + use_lora, fc2_lora, stream, parallelism_config, config, min_latency_mode, num_active_experts_per, + active_expert_global_ids, start_expert); } virtual size_t getGemmWorkspaceSize(int num_experts_per_node) const override @@ -679,7 +678,7 @@ public: TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, - int const* permuted_row_to_unpermuted_row, cudaStream_t stream) override + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream) override { return Self::computeStridesTmaWarpSpecialized(expert_first_token_offset, layout_info1, layout_info2, num_tokens, expanded_num_tokens, gemm1_n, gemm1_k, gemm2_n, gemm2_k, num_experts_per_node, @@ -688,8 +687,8 @@ public: alpha_scale_flat1, alpha_scale_flat2, fp4_act_flat1, fp4_act_flat2, quant_params, reinterpret_cast(bias1), reinterpret_cast(bias2), reinterpret_cast(gemm1_output), - reinterpret_cast(gemm2_output), router_scales, permuted_row_to_unpermuted_row, - stream); + reinterpret_cast(gemm2_output), router_scales, + expanded_dest_row_to_expanded_source_row, stream); } std::pair @@ -731,8 +730,8 @@ private: float const* alpha_scale_flat2, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, ScaleBiasType const* bias1, ScaleBiasType const* bias2, UnfusedGemmOutputType* gemm1_output, - UnfusedGemmOutputType* gemm2_output, float const* router_scales, int const* permuted_row_to_unpermuted_row, - cudaStream_t stream); + UnfusedGemmOutputType* gemm2_output, float const* router_scales, + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream); static std::pair 
computeStridesTmaWarpSpecializedLowLatency(TmaWarpSpecializedGroupedGemmInput layout_info1, TmaWarpSpecializedGroupedGemmInput layout_info2, int64_t num_tokens, int64_t gemm1_n, int64_t gemm1_k, @@ -793,18 +792,17 @@ private: void* const intermediate_result, int64_t const* const expert_first_token_offset, WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases, float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationParams fc1_activation_type, QuantParams& quant_params, - cudaStream_t stream); + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); static void BlockScaleFC2(DeepSeekBlockScaleGemmRunner& gemm_runner, T const* const input, void* const gemm_output, OutputType* const final_output, int64_t const* const expert_first_token_offset, WeightType const* const fc2_expert_weights, ScaleBiasType const* const fc2_expert_biases, - float const* const token_topk_unpermuted_scales, int const* const unpermuted_row_to_permuted_row, + float const* const token_topk_unpermuted_scales, int const* const expanded_source_row_to_expanded_dest_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, - int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, int64_t const hidden_size, - int64_t const unpadded_hidden_size, int64_t const inter_size, int const num_experts_per_node, int64_t const k, - MOEParallelismConfig parallelism_config, QuantParams& quant_params, cudaStream_t stream); + int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, int64_t const k, MOEParallelismConfig parallelism_config, + QuantParams& 
quant_params, cudaStream_t stream); T const* applyPrequantScale(void* smoothed_act, void const* permuted_data, void const* prequant_scales, int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq, @@ -960,4 +958,6 @@ private: // Populates a buffer with random values for use with MOE benchmarking void populateRandomBuffer(void* buffer_void, size_t size, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 769039f568..935aabe42d 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4c70e6e756b7c4efb0abcd0156e38d10481e9493e48fd140f9efcd1cdda68a3 -size 66889324 +oid sha256:d74cbe0df4f798fbc0c157280ebcc734ad6d1897ba3b43026e4aa22a2a4480a5 +size 66904288 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index b37609a070..4194d5219e 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -9db5ce2be51af2d4bd983af497ac9dbe53d8c57284d7ba455babd95c202db7d4 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 33f251e0599197ad3e6c59d64a42f9721d3cc27c +a396f947f273fc752469160c9ae83caf393017d096cf4881ee09ad6af64296e1 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 
9fc66c405c7caaaeb65542ba1498f00d863f0a4a diff --git a/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu b/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu index e5675172ac..3b91cf3f17 100644 --- a/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu +++ b/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu @@ -14,12 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCachePartialCopy.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -133,4 +134,5 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/kvCacheUtils.h b/cpp/tensorrt_llm/kernels/kvCacheUtils.h index 065c2e7b70..166f476112 100644 --- a/cpp/tensorrt_llm/kernels/kvCacheUtils.h +++ b/cpp/tensorrt_llm/kernels/kvCacheUtils.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheIndex.h" #include @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // Internal for K and V cache indexing @@ -38,7 +41,7 @@ enum class KVIdxType : int32_t // only the fields necessary for context FMHA struct KVBlockArrayForContextFMHA { - using DataType = KVCacheIndex const; + using DataType = ::tensorrt_llm::kernels::KVCacheIndex const; // The maximum number of sequences supported by the kv-cache. 
int32_t mMaxSeqs; @@ -322,4 +325,6 @@ struct KVLinearBuffer } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/layernormKernels.cu b/cpp/tensorrt_llm/kernels/layernormKernels.cu index e7943d04c2..f8dbd9343e 100644 --- a/cpp/tensorrt_llm/kernels/layernormKernels.cu +++ b/cpp/tensorrt_llm/kernels/layernormKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/quantTypeUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -340,4 +341,5 @@ INSTANTIATE_GENERAL_LAYERNORM(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/layernormKernels.h b/cpp/tensorrt_llm/kernels/layernormKernels.h index d2e7335e03..08581713d9 100644 --- a/cpp/tensorrt_llm/kernels/layernormKernels.h +++ b/cpp/tensorrt_llm/kernels/layernormKernels.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeGeneralLayerNorm(T* out, T const* input, T const* gamma, T const* bet float* dynamic_scale = nullptr, float* sum_per_token = nullptr, QuantT* out_quant = nullptr); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu index a43c8cfd32..7bdf7f593a 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu +++ 
b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu @@ -14,10 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm { struct __align__(8) aligned_bf16x4 @@ -125,4 +128,6 @@ void llama4_bf16_bf16_gemm_op(int num_tokens, void const* A, void const* B, void llama4_bf16_bf16_gemm_launcher(num_tokens, A_bf16, B_bf16, C_bf16, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h index 18104f2a2b..a9d079a7cb 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h @@ -15,13 +15,18 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm { void llama4_bf16_bf16_gemm_op(int num_tokens, void const* A, void const* B, void* C, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu index aa54651f0d..53efc2d24a 100644 --- 
a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh" @@ -21,7 +22,9 @@ #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { DEFINE_GET_PER_BLOCK_FUNC_PTR(/*HIDDEN_IN=*/5120, /*ALIGNED=*/true); @@ -186,4 +189,6 @@ void llama4_fp8_bf16_gemm_op(void const* A, void const* B, void* C, void const* } } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h index 709d56d3bf..35297bde38 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h @@ -16,15 +16,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { void llama4_fp8_bf16_gemm_op(void const* A, void const* B, void* C, void const* scaling_factor, void const* pos_ids, bool pos_ids_int64, int num_tokens, int hidden_in, int hidden_out, cudaStream_t stream); -} // namespace 
tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh index b330908d09..56ed6e4b0d 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. 
@@ -357,4 +360,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_ DISPATCH_PER_BLOCK_FC_FP8_BF16_ATTN_SCALING_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED, POS_IDS_INT64); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh index eac5a41399..618a0aea0b 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. 
@@ -297,4 +300,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker DISPATCH_PER_BLOCK_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh index 592995dc4a..2172acde74 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT / WARP_PER_BLOCK. @@ -323,4 +326,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern DISPATCH_PER_WARP_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu index 9f7b897043..6b0c988383 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu @@ -14,13 +14,16 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { DEFINE_GET_FUNC_PTR(5120, true); @@ -236,4 +239,6 @@ void llama4_fp8_fp8_gemm_swiglu_op(int num_tokens, int hidden_in, int hidden_out A, B, C, in_scale, out_scale_inv, num_tokens, hidden_in, hidden_out, tactic.first, tactic.second, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h index aa11c4485d..f202578301 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h @@ -16,16 +16,21 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { void llama4_fp8_fp8_gemm_swiglu_op(int num_tokens, int hidden_in, int hidden_out, void const* A, void const* B, void* C, void const* in_scale, void const* out_scale_inv, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh 
b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh index e0a459656b..d6923c4afd 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh @@ -16,11 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. @@ -337,4 +340,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo DISPATCH_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu index fd4b29fd65..87b8e0d16c 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include @@ -33,7 +34,9 @@ #define ENABLE_PREFETCH 1 #define ENABLE_PREEXIT 1 -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_moe { #define TOPK_VEC_SIZE 4 @@ -351,4 +354,6 @@ void run_moe_llama4_tp8ep1_min_latency(int num_tokens, int num_experts, exp_idx, output_void, dequant_fc2, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +} // namespace kernels::llama4_min_latency::llama4_moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h index 7d0d52c683..2cac832b39 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_moe { // Launch moe_mlp_fc13_swiglu_fp8_5120 and moe_fc_fp8_bf16_1024. 
@@ -37,4 +40,6 @@ void run_moe_llama4_tp8ep1_min_latency(int num_tokens, int num_experts, void* __restrict__ output_void, // FC2 output tensor BF16 [num_tokens][HIDDEN_SIZE] cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +} // namespace kernels::llama4_min_latency::llama4_moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh index de5df85da2..0e01146990 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh @@ -16,11 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/envUtils.h" -namespace tensorrt_llm::kernels::llama4_min_latency +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency { namespace llama4_bf16_bf16_gemm @@ -119,4 +122,6 @@ struct __align__(8) aligned_bfloat16x4 __align__(8) __nv_bfloat16 data[4]; }; -} // namespace tensorrt_llm::kernels::llama4_min_latency +} // namespace kernels::llama4_min_latency + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/logitsBitmask.cu b/cpp/tensorrt_llm/kernels/logitsBitmask.cu index 084e660cc7..ac66967e0f 100644 --- a/cpp/tensorrt_llm/kernels/logitsBitmask.cu +++ b/cpp/tensorrt_llm/kernels/logitsBitmask.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/logitsBitmask.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -330,4 +331,5 @@ template void invokeContiguousLogitsBitmask<__nv_bfloat16>(__nv_bfloat16* logits #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/logitsBitmask.h b/cpp/tensorrt_llm/kernels/logitsBitmask.h index 942f8acada..e2e6cb28cd 100644 --- a/cpp/tensorrt_llm/kernels/logitsBitmask.h +++ b/cpp/tensorrt_llm/kernels/logitsBitmask.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeContiguousLogitsBitmask(T* logits, uint32_t const* bitmask, int32_t c int32_t batchSize, int32_t vocabSizePadded, int32_t bitmaskSize, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lookupKernels.cu b/cpp/tensorrt_llm/kernels/lookupKernels.cu index 1ae2ed8258..f1435bf0d3 100644 --- a/cpp/tensorrt_llm/kernels/lookupKernels.cu +++ b/cpp/tensorrt_llm/kernels/lookupKernels.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/lookupKernels.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { /* When running with multiple GPUs, we split the embedding lookup table across multiple GPUs to save the memory @@ -92,4 +93,5 @@ INSTANTIATE_LOOK_UP(__nv_bfloat16, int8_t, int); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lookupKernels.h b/cpp/tensorrt_llm/kernels/lookupKernels.h index ac5f3f4a77..9dc5ba4886 100644 --- a/cpp/tensorrt_llm/kernels/lookupKernels.h +++ b/cpp/tensorrt_llm/kernels/lookupKernels.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -30,4 +31,5 @@ void invokeLookUp(Tout* out, Idx const* input, Tin const* weight, int64_t const Idx const size, Idx const n_embed, Tout const* perTokenScales, cudaStream_t stream = 0); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/dora.h b/cpp/tensorrt_llm/kernels/lora/dora.h index b8e763f5d3..fc21fe6693 100644 --- a/cpp/tensorrt_llm/kernels/lora/dora.h +++ b/cpp/tensorrt_llm/kernels/lora/dora.h @@ -15,10 +15,13 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class DoraImpl { @@ -40,4 +43,6 @@ private: std::vector mHostBuf; nvinfer1::DataType mType; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/lora.cpp b/cpp/tensorrt_llm/kernels/lora/lora.cpp index 67e774f60c..167826be62 100644 --- a/cpp/tensorrt_llm/kernels/lora/lora.cpp +++ b/cpp/tensorrt_llm/kernels/lora/lora.cpp @@ -15,18 +15,21 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/lora/lora.h" - #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/groupGemm.h" +#include "tensorrt_llm/kernels/lora/lora.h" #include "tensorrt_llm/kernels/splitkGroupGemm.h" #include "tensorrt_llm/runtime/iBuffer.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // TODO should reuse the function in gemmPlugin @@ -339,4 +342,6 @@ int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* inp return impl->run(numTokens, numReqs, input, loraRanks, loraWeightsPtr, weightIndex, outputs, workspace, stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/lora.h b/cpp/tensorrt_llm/kernels/lora/lora.h index 38437b5348..7215a7af74 100644 --- a/cpp/tensorrt_llm/kernels/lora/lora.h +++ b/cpp/tensorrt_llm/kernels/lora/lora.h @@ -17,13 +17,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using CublasGemmWrapper = tensorrt_llm::common::CublasMMWrapper; @@ -70,4 +73,6 @@ private: int Lora_run(LoraImpl* impl, 
int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks, void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lruKernel.cu b/cpp/tensorrt_llm/kernels/lruKernel.cu index a0fc4fdb84..731ccb016e 100644 --- a/cpp/tensorrt_llm/kernels/lruKernel.cu +++ b/cpp/tensorrt_llm/kernels/lruKernel.cu @@ -27,12 +27,13 @@ #endif #include "lruKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -437,4 +438,5 @@ INSTANTIATE_RGLRU_UPDATE_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_RGLRU_UPDATE_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lruKernel.h b/cpp/tensorrt_llm/kernels/lruKernel.h index c49f039d48..a0f31bbea5 100644 --- a/cpp/tensorrt_llm/kernels/lruKernel.h +++ b/cpp/tensorrt_llm/kernels/lruKernel.h @@ -17,9 +17,10 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -56,4 +57,5 @@ template void invokeRGLRUUpdate(lruParams& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu index 8e58d80ffa..e7489b29cf 100644 --- a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu +++ b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu @@ -26,6 +26,7 @@ #include "mambaConv1dKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" @@ -97,8 +98,8 
@@ __device__ static inline void cp_wait_group() #endif } -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1318,4 +1319,5 @@ template void invokeMambaConv1dGeneration<__nv_bfloat16>(MambaConv1dParamsBase& #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h index 4fb0d2dec4..2c7eadc5b0 100644 --- a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h +++ b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h @@ -17,10 +17,11 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -48,4 +49,5 @@ template void invokeMambaConv1dGeneration(MambaConv1dParamsBase& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu index 97fd88a50e..cc06fe4bc1 100644 --- a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu +++ b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu @@ -16,6 +16,7 @@ #include "mlaChunkedPrefill.cuh" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/mathUtils.h" #include @@ -290,8 +291,8 @@ __global__ void loadChunkedKVCacheForMLAKernel(T* output_kv_ptr, T* output_k_pe_ } // namespace -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -351,4 +352,5 @@ INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(half); INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(float); INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(__nv_bfloat16); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh 
b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh index 551e6d79a5..84ff1821e2 100644 --- a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh +++ b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { // merged_attn [q_total_len, H=128, D=128] (T) @@ -38,4 +39,5 @@ void invokeMLALoadChunkedKV(T* output_kv_ptr, T* output_k_pe_ptr, KVBlockArray c int64_t const* cu_ctx_chunked_len, int64_t const* chunked_ld_global_offset, int lora_size, int rope_size, int max_seq_len, float const* kv_scale_quant_orig_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.cu b/cpp/tensorrt_llm/kernels/mlaKernels.cu index d678cbe082..8acd92a3c6 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.cu +++ b/cpp/tensorrt_llm/kernels/mlaKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -31,8 +32,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1139,4 +1140,4 @@ INSTANTIATE_RW_KVCACHE_MLA(__nv_bfloat16, __nv_fp8_e4m3); } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.h b/cpp/tensorrt_llm/kernels/mlaKernels.h index ce6f4b1bfa..de458857bd 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.h +++ b/cpp/tensorrt_llm/kernels/mlaKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -133,4 +134,5 @@ void invokeMLARopeAppendPagedKVAssignQ(KVBlockArray& kv_cache, T* q_ptr, T* late float2 const* cos_sin_cache, size_t head_num, int nope_size, int rope_size, int lora_size, float const* kv_scale_orig_quant_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeAlignKernels.cu b/cpp/tensorrt_llm/kernels/moeAlignKernels.cu index ae54aa5f4c..4cb4cfb2f0 100644 --- a/cpp/tensorrt_llm/kernels/moeAlignKernels.cu +++ b/cpp/tensorrt_llm/kernels/moeAlignKernels.cu @@ -18,13 +18,17 @@ #include "moeAlignKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/moeCommKernelsCommon.h" #include #define CEILDIV(x, y) (((x) + (y) -1) / (y)) #define WARP_SIZE 32 -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -277,4 +281,6 @@ void 
invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeAlignKernels.h b/cpp/tensorrt_llm/kernels/moeAlignKernels.h index 1cf048858d..0f730271d0 100644 --- a/cpp/tensorrt_llm/kernels/moeAlignKernels.h +++ b/cpp/tensorrt_llm/kernels/moeAlignKernels.h @@ -16,10 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { /** @@ -43,4 +46,6 @@ void invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, int32_t* expert_ids, int32_t* num_tokens_post_pad, int32_t num_experts, int32_t block_size, int32_t numel, int32_t max_num_tokens_padded, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h index 7d4310764b..0993c987e6 100644 --- a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h +++ b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h @@ -15,10 +15,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -44,4 +45,5 @@ struct MoeExpertParallelInfo }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h index d3e8063a04..7c8aa86c22 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h @@ -16,8 +16,9 @@ */ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -89,4 +90,5 @@ struct MoePlacementInfo }; } // namespace kernels -} // 
namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu index 6c3440e9a2..1f5a9bb8e5 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include #include @@ -24,8 +25,8 @@ namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -665,4 +666,5 @@ void moeSetSignalForGpuStageHost(MoeLoadBalanceSingleLayerSignal* signal, int64_ } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h index 85acd1fb68..29c6ed5373 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h @@ -16,10 +16,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -133,4 +134,5 @@ void moeWaitSignalForCpuStageHost(MoeLoadBalanceSingleLayerSignal* signal); void moeSetSignalForGpuStageHost(MoeLoadBalanceSingleLayerSignal* signal, int64_t iterId, bool enableStatistic); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moePrepareKernels.cu b/cpp/tensorrt_llm/kernels/moePrepareKernels.cu index b401746857..f657f60086 100644 --- a/cpp/tensorrt_llm/kernels/moePrepareKernels.cu +++ b/cpp/tensorrt_llm/kernels/moePrepareKernels.cu @@ -15,6 +15,7 @@ */ #include "moePrepareKernels.h" +#include "tensorrt_llm/common/config.h" #include @@ -24,7 +25,9 @@ namespace 
cg = cooperative_groups; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace moe_prepare @@ -374,4 +377,6 @@ size_t getMoePrepareWorkspaceSize(int epSize) } // namespace moe_prepare -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moePrepareKernels.h b/cpp/tensorrt_llm/kernels/moePrepareKernels.h index c7a095e394..ef33b4c6af 100644 --- a/cpp/tensorrt_llm/kernels/moePrepareKernels.h +++ b/cpp/tensorrt_llm/kernels/moePrepareKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/cudaUtils.h" @@ -23,7 +24,9 @@ #define DEBUG_PIPELINE 0 -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace moe_prepare @@ -87,4 +90,6 @@ size_t getMoePrepareWorkspaceSize(int epSize); } // namespace moe_prepare -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh index 665086c7dc..c94ff267e5 100644 --- a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh +++ b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh @@ -17,6 +17,7 @@ #pragma once #ifndef TRTLLM_MOETOPKFUNCS_CUH_H #define TRTLLM_MOETOPKFUNCS_CUH_H +#include "tensorrt_llm/common/config.h" #include #include @@ -24,7 +25,9 @@ #include "tensorrt_llm/kernels/archCondition.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace reduce_topk @@ -281,5 +284,7 @@ __forceinline__ __device__ void reduceTopK(cg::thread_block_tile con #undef TOPK_SWAP } // namespace reduce_topk -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END #endif // TRTLLM_MOETOPKFUNCS_CUH_H diff --git a/cpp/tensorrt_llm/kernels/moe_utils.cuh b/cpp/tensorrt_llm/kernels/moe_utils.cuh index ad8fce9fbd..bf35db9bbd 100644 --- a/cpp/tensorrt_llm/kernels/moe_utils.cuh +++ 
b/cpp/tensorrt_llm/kernels/moe_utils.cuh @@ -17,8 +17,9 @@ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -45,4 +46,5 @@ __device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, i } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h index a3363388f3..74c27759d7 100644 --- a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h +++ b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -122,4 +123,5 @@ static constexpr int kIdxScaleSoftmaxPtr = 0; static constexpr int kIdxScaleSoftmaxLog2Ptr = 1; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu index b132a54b5f..19eb4be4c1 100644 --- a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu @@ -16,6 +16,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/noAuxTcKernels.h" @@ -26,7 +27,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr int WARP_SIZE = 32; static constexpr int NumKimiK2Experts = 384; @@ -334,4 +337,6 @@ INSTANTIATE_NOAUX_TC(__nv_bfloat16, float, __nv_bfloat16, int32_t); INSTANTIATE_NOAUX_TC(__nv_bfloat16, half, __nv_bfloat16, int32_t); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END 
diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.h b/cpp/tensorrt_llm/kernels/noAuxTcKernels.h index e79ceee4f4..dfe6908723 100644 --- a/cpp/tensorrt_llm/kernels/noAuxTcKernels.h +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.h @@ -17,12 +17,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -30,4 +33,6 @@ void invokeNoAuxTc(InputT* scores, BiasT* bias, OutputT* topk_values, IdxT* topk int64_t const num_experts, int64_t const n_group, int64_t const topk_group, int64_t const topk, double const routed_scaling_factor, cudaStream_t const stream = 0); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.cu b/cpp/tensorrt_llm/kernels/penaltyKernels.cu index a85f174208..08154c70c8 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.cu @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/kernels/penaltyKernels.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" + #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/penaltyKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include @@ -27,7 +28,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __device__ bool almostEqual(float a, float b, float epsilon) @@ -262,4 +265,6 @@ template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const& params); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.h b/cpp/tensorrt_llm/kernels/penaltyKernels.h index c6ab87951d..b8f2309957 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -57,4 +60,6 @@ struct InvokeBatchApplyPenaltyParams template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const& params); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyTypes.h b/cpp/tensorrt_llm/kernels/penaltyTypes.h index 79ab634967..e8d8a9201b 100644 --- a/cpp/tensorrt_llm/kernels/penaltyTypes.h +++ b/cpp/tensorrt_llm/kernels/penaltyTypes.h @@ -17,13 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + 
namespace kernels { @@ -56,4 +57,5 @@ inline std::pair getLimitsPenalty(DecodingPenaltyType penaltyType) return std::make_pair(fltMin, fltMax); } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu index 1219d371f8..ede009307e 100644 --- a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu +++ b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu @@ -14,11 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moe_utils.cuh" #include "tensorrt_llm/kernels/preQuantScaleKernel.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -206,4 +207,5 @@ INSTANTIATE_PREQUANT_SCALE_PER_EXPERT(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h index 47183b79be..8d4a9eef77 100644 --- a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h +++ b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -30,8 +31,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -45,4 +46,5 @@ void apply_per_channel_scale_per_expert_kernel_launcher(T_out* smoothed_act, T_i int const num_experts_per_node, int64_t const* num_valid_tokens_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemm.h b/cpp/tensorrt_llm/kernels/qserveGemm.h index e5aa0bdb31..f9b374067e 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemm.h +++ b/cpp/tensorrt_llm/kernels/qserveGemm.h @@ -17,10 +17,11 @@ #pragma once #include 
"tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace qserve @@ -71,4 +72,5 @@ public: } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu b/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu index 23432cb030..d7fa4939f3 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu +++ b/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu @@ -22,11 +22,12 @@ // } #include "qserveGemm.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace qserve @@ -605,4 +606,5 @@ void QServeGemmRunner::gemmPerChannel(ParamsPerChannel const& params, cudaStream } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu b/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu index 4ffebc2e27..e2f25c57ba 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu +++ b/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu @@ -21,11 +21,11 @@ // } #include "qserveGemm.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -663,4 +663,5 @@ size_t QServeGemmRunner::getWorkspaceSize(int const m, int const n, int const k) } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 78248214c1..3941277dfa 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" 
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -26,8 +27,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -429,4 +430,5 @@ template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int b, int m, int n, __nv #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 665ec2b42e..5a645e36f1 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantTypeUtils.cuh" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -902,4 +903,5 @@ quantize_with_block_size( __global__ void block_scale_interleave_kernel( int numbatches, int numRows, int numCols, uint8_t const* SFIn, uint8_t* SFOutput); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.h b/cpp/tensorrt_llm/kernels/quantization.h index 70776b2790..e571a40a16 100644 --- a/cpp/tensorrt_llm/kernels/quantization.h +++ b/cpp/tensorrt_llm/kernels/quantization.h @@ -15,13 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include #include -namespace tensorrt_llm -{ - +TRTLLM_NAMESPACE_BEGIN enum class QuantizationSFLayout { // Block scale factors are stored in swizzled layout for cutlass FP4 kernel. 
Scale factor @@ -93,4 +92,5 @@ void computePerTokenGlobalScaleForFP4Quantization(int b, int m, int n, T const* float* globalScale, int multiProcessorCount, cudaStream_t stream = 0); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu index 050f99efda..b2355aa8d8 100644 --- a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu +++ b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/recoverFromRingAtten.h" @@ -23,8 +24,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -138,4 +139,5 @@ INSTANTIATE_RECOVER_RA(half); INSTANTIATE_RECOVER_RA(__nv_bfloat16); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h index 86ca60c2ab..9d433d0714 100644 --- a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h +++ b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -31,4 +32,5 @@ void invokeRecoverFromRA(Tout* accu_output, float* accu_softmax_stats, Tout* out int h, int d, int* cu_seqlens, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/rmsnormKernels.cu b/cpp/tensorrt_llm/kernels/rmsnormKernels.cu index c30280bf0d..8dfb6e6ade 100644 --- a/cpp/tensorrt_llm/kernels/rmsnormKernels.cu +++ b/cpp/tensorrt_llm/kernels/rmsnormKernels.cu @@ -14,6 +14,7 @@ * limitations under the 
License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/quantTypeUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -283,4 +284,5 @@ INSTANTIATE_GENERAL_RMSNORM(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/rmsnormKernels.h b/cpp/tensorrt_llm/kernels/rmsnormKernels.h index df3ca6f665..fca852c898 100644 --- a/cpp/tensorrt_llm/kernels/rmsnormKernels.h +++ b/cpp/tensorrt_llm/kernels/rmsnormKernels.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeGeneralRmsNorm(T* out, T const* input, T const* gamma, T const* beta, float* sum_per_token = nullptr, QuantT* out_quant = nullptr); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu index e45a7bb97f..fceea61041 100644 --- a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -610,4 +611,5 @@ void unpadding( } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sageAttentionKernels.h b/cpp/tensorrt_llm/kernels/sageAttentionKernels.h index c2039206a5..4ef82e5b15 100644 --- a/cpp/tensorrt_llm/kernels/sageAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/sageAttentionKernels.h @@ -15,13 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template = 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/samplingTopPKernels.h" @@ -35,8 +37,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1466,4 +1468,5 @@ template size_t getAirTopPWorkspaceSize(int32_t batchSize, int32_t vocabSi template uint32_t calcAirTopPBlockNum(int batchSize, int len, int smCnt, bool isDeterministic); template uint32_t calcAirTopPBlockNum(int batchSize, int len, int smCnt, bool isDeterministic); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 1c5d8446de..c175e708fb 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -19,10 +19,12 @@ #error CUDART_VERSION 
Undefined! #elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -471,4 +475,6 @@ void invokeSetupTopKTopPRuntimeArgs(SizeType32 batchSize, ScatterDecodingParamEn } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index ace034dc43..cb7f835f4d 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -17,12 +17,15 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr runtime::SizeType32 TOP_K_MAX = 1024; @@ -302,4 +305,6 @@ __device__ __host__ inline void setupTopKTopPRuntimeArgOne(runtime::SizeType32 b } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu index 5c9e6945c9..d7a8d66ecf 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu @@ -17,10 +17,12 @@ #error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -30,7 +32,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void topPInitialize(TokenIdType* topPIdValBuf, SizeType32* topPOffsetBuf, SizeType32* beginTopPOffsetBuf, SizeType32 batchSize, SizeType32 vocabSize) @@ -515,4 +519,6 @@ void invokeSetTopPRuntimeArgs(SizeType32 batchSize, ScatterDecodingParamEntry -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template struct TopPSamplingKernelParams @@ -188,4 +191,6 @@ void invokeSetTopPRuntimeArgs(runtime::SizeType32 batchSize, ScatterDecodingPara ScatterDecodingParamEntry topP, bool* skipDecodePtr, float* initialTopPPtr, runtime::SizeType32 const* batchSlotsPtr, bool onDevice, cudaStream_t stream = nullptr); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h b/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h index ea4f052032..0d8226026e 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -896,6 +897,6 @@ static inline BmmChunkKernelFunc getBmmChunkKernel(int B_, int L_, int H_, int P } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h 
index 30a1a2c5f9..0b990e5942 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -361,6 +362,6 @@ static inline ChunkCumsumKernelFunc getChunkCumsumKernel(int B_, int L_, int H_, } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h index cc81fb5094..3360560c6f 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2285,6 +2286,6 @@ static inline ChunkScanKernelFunc getChunkScanKernel(int B_, int L_, int H_, int } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h index 1664f0062c..66c0826a69 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2260,6 +2261,6 @@ static inline ChunkStateKernelFunc getChunkStateKernel(int B_, int L_, int H_, i } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et 
sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu index 4d8acd59de..935cbd3743 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu @@ -15,15 +15,16 @@ */ #include "../bmmchunk.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetBmmChunkKernelFunc getBmmChunkKernel_bf16 = getBmmChunkKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu index 096a2fec11..4b24405a47 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu @@ -15,15 +15,16 @@ */ #include "../bmmchunk.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetBmmChunkKernelFunc getBmmChunkKernel_fp16 = getBmmChunkKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu index 43fda3c64a..25b8cea5da 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_bf16_bf16 = getChunkCumsumKernel; } // namespace kernels 
-} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu index ab7c214f8e..6dce67340f 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_bf16_fp32 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu index bf3c78a9c3..c008cbec65 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_fp16_fp16 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu index 30c6ac7266..18ca02aad4 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" 
+ +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_fp16_fp32 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu index ac12abea52..0cae8b68ac 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_bf16_bf16 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu index 2c85472a0d..b91a175a09 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_bf16_fp32 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu index 8c330cf815..bf5f7d21a5 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu +++ 
b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_fp16_fp16 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu index 7c4f11af70..e65f40073e 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_fp16_fp32 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu index 7f7e224f2b..98bdcacd8c 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkstate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkStateKernelFunc getChunkStateKernel_bf16 = getChunkStateKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu index 
7c247c5b32..32a70b8698 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkstate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkStateKernelFunc getChunkStateKernel_fp16 = getChunkStateKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu index c62ea0c9be..968adee38a 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu @@ -15,15 +15,16 @@ */ #include "../statepassing.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetStatePassingKernelFunc getStatePassingKernel_bf16 = getStatePassingKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu index 0627699fda..f3f9e00224 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu @@ -15,15 +15,16 @@ */ #include "../statepassing.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetStatePassingKernelFunc getStatePassingKernel_fp16 = getStatePassingKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git 
a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu index 28b7cc5198..8f0a323304 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include @@ -36,8 +37,8 @@ #include "chunkstate.h" #include "statepassing.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -628,4 +629,5 @@ INSTANTIATE_SELECTIVE_SCAN_UPDATE_DATA_TYPE(__nv_bfloat16, float); #undef INSTANTIATE_SELECTIVE_SCAN_UPDATE_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h index 493d56bc5e..88f28b991b 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h @@ -31,11 +31,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -80,4 +81,5 @@ void invokeChunkScan(SSMParamsBase& params, cudaStream_t stream, tensorrt_llm::c template void invokeSelectiveScanUpdate(SSMParamsBase& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h b/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h index 36dbe526fd..a94dd5c363 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ #include "CudaType.h" #include "Poly.h" 
-namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -265,6 +266,6 @@ static inline StatePassingKernelFunc getStatePassingKernel(int B_, int L_, int H } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu index 4d305467b6..6d3fe898d1 100644 --- a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -199,4 +200,5 @@ void invokeGatherKvPageOffsets(int32_t* output_kv_page_offsets, int32_t* output_ kv_page_offsets, seq_lengths, sparse_params, batch_size, tokens_per_page, max_num_pages_per_seq); } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h index 29487567d2..6c701a6861 100644 --- a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h @@ -15,14 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -82,4 +83,5 @@ void invokeGatherKvPageOffsets(int32_t* output_kv_page_offsets, // [num_head_kv, int32_t const tokens_per_page, int32_t const max_num_pages_per_seq, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu index 
d474742bbb..4ff9159864 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" @@ -35,7 +36,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { template __global__ void packAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32* pathsOffsets, @@ -485,4 +488,6 @@ template size_t getTypicalAcceptanceWorkspaceSize( template size_t getTypicalAcceptanceWorkspaceSize( SizeType32 batchSize, SizeType32 maxDecodingTokens, SizeType32 vocabSizePadded); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h index 8da35fb054..bedf152e44 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h @@ -17,13 +17,16 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief Linearly packs accepted paths in memory according to the accceptedLengths and bestPathIds @@ -205,4 +208,6 @@ template size_t getTypicalAcceptanceWorkspaceSize( runtime::SizeType32 batchSize, runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 vocabSizePadded); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu index 88e6ea977b..7788dc6134 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu @@ -26,14 +26,15 @@ #include "draftTokenTreeKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -99,4 +100,5 @@ void invokeExtractRealDraftTokens(ExtractRealDraftTokensParam& params, cudaStrea } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h index be660e554a..67a28e5e2e 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h @@ -21,12 +21,12 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ -// namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -51,4 +51,4 @@ void 
invokeExtractRealDraftTokens(ExtractRealDraftTokensParam& params, cudaStrea } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu index e963033855..d707d286f5 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu @@ -15,11 +15,13 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h" + #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! @@ -32,7 +34,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -2321,4 +2325,6 @@ void invokeCopyFinalDraftTokens(SizeType32 batchSize, SizeType32 maxDecodingDraf sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h index 7a8b97f679..9cc639917f 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include 
"tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! \brief Sets pointers to logits in logitsPtrs according to the draftDecodingTokens. @@ -782,4 +785,6 @@ void invokeCopyFinalDraftTokens(runtime::SizeType32 batchSize, runtime::SizeType runtime::TokenIdType const* const* thirdTopKOutputIdsPtrs, runtime::TokenIdType* pluginOutputAllLayersDraftTokenIds, runtime::TokenIdType* pluginOutputDraftTokenIds, runtime::SizeType32* pluginOutputDraftLens, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu index 27f89b8074..eaab2215f1 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu @@ -14,7 +14,9 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" + #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! 
@@ -30,7 +32,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { size_t invokeScanGenerationLengths(void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, SizeType32 const* __restrict__ generationLengths, SizeType32* __restrict__ scannedGenerationLengths, @@ -636,4 +640,6 @@ template void invokeCopyProbs(PackExplicitDraftTokensParams const& params, template void invokeCopyProbs(PackExplicitDraftTokensParams<__nv_bfloat16> const& params, cudaStream_t stream); #endif // ENABLE_BF16 -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h index d2ab345cd4..9b56f344c3 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h @@ -17,12 +17,15 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { template @@ -374,4 +377,6 @@ void invokeConvertMaskToPackedMask(runtime::SizeType32 batchSize, runtime::SizeType32 const* __restrict__ batchSlots, runtime::SizeType32 maxDraftTokens, runtime::SizeType32 maxGenerationLength, runtime::SizeType32* __restrict__ packedMask, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu 
b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu index d0da906b8b..2f5eeb2c0a 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu @@ -15,10 +15,12 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" + #include "tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! @@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -314,4 +318,6 @@ void invokeForwardAcceptedTokens(SizeType32 batchSize, SizeType32 const* batchSl sync_check_cuda_error(stream); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h index 1dcb8f32b6..92fb3f6898 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief Accepts or rejects draft tokens based on their probability distributions or the equality of draft and target @@ -95,4 +98,6 @@ void invokeForwardAcceptedTokens(runtime::SizeType32 batchSize, runtime::SizeTyp runtime::TokenIdType** idsPtrs, runtime::SizeType32 step, runtime::SizeType32 maxDraftTokens, runtime::TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu index 2cb22314e2..8d1ca4530d 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu @@ -15,6 +15,7 @@ */ #include "kvCacheUpdateKernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" @@ -22,7 +23,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { using namespace tensorrt_llm::runtime; @@ -334,4 +337,6 @@ void updateKVBlockArrayDraftTokenLocationSeparateRewind(SizeType32 const* seqAcc canUseOneMoreBlock, stream); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h index 69643b0098..f8551db9b7 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h @@ -16,12 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include 
"tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { using IndexType = int; @@ -205,4 +208,6 @@ void updateKVBlockArrayDraftTokenLocation(runtime::SizeType32 const* seqAccepted runtime::SizeType32 maxKVCacheLen, runtime::SizeType32 maxBlocksPerSeq, runtime::SizeType32 tokensPerBlock, bool canUseOneMoreBlock, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu index 8db96a37d5..c109f28e9a 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu @@ -15,10 +15,12 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" + #include "tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! 
@@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -62,4 +66,6 @@ void scatterMedusaDraftTokens(TokenIdType* treeDraftIds, TokenIdType const* sour scatterMedusaDraftTokens<<>>( treeDraftIds, sourceDraftIds, treeIds, tokensPerStep, batchSlots, maxDecodingTokens); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h index a284fb16ca..8e79aa653e 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief assembles draft tokens to treeDraftIds from sourceDraftIds using indices of treeIds @@ -45,4 +48,6 @@ void scatterMedusaDraftTokens(runtime::TokenIdType* treeDraftIds, runtime::Token runtime::SizeType32 const* treeIds, runtime::SizeType32 const* tokensPerStep, runtime::SizeType32 const* batchSlots, runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu index 2e370a4900..eb72d69d49 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu @@ -26,13 +26,14 @@ #include "mtpKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -561,4 +562,5 @@ template void invokeMTPRelaxedAcceptance<__nv_bfloat16>(MTPRelaxedAcceptancePara #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h index e19908101f..4beeac53ba 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h @@ -17,15 +17,16 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + // namespace tensorrt_llm::kernels 
namespace kernels { @@ -115,4 +116,4 @@ void invokeMTPRelaxedAcceptance(MTPRelaxedAcceptanceParam& params, cudaStream_t } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu b/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu index e6f6f55f92..6397396ea6 100644 --- a/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu +++ b/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu @@ -21,15 +21,18 @@ #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaUtils.h" + +#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/device/splitk_gemm_grouped.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_splitk_gemm_grouped.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/splitk_gemm_grouped.h" -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm::kernels +namespace kernels { int64_t inline getGemmCoordSize(int64_t problemCount) @@ -288,4 +291,6 @@ void splitkGroupedGemm(std::vector const& problemSizes } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/splitkGroupGemm.h b/cpp/tensorrt_llm/kernels/splitkGroupGemm.h index 8d7af7e4bf..6ada825529 100644 --- a/cpp/tensorrt_llm/kernels/splitkGroupGemm.h +++ b/cpp/tensorrt_llm/kernels/splitkGroupGemm.h @@ -16,10 +16,13 @@ #pragma once #include "cutlass/gemm_coord.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { int64_t getSplitkGroupedGemmParamsWorkSpaceSize(int64_t problem_count); @@ -29,4 +32,6 @@ void 
splitkGroupedGemm(std::vector const& problem_size void* gemmParamsWorkspace, int64_t gemmParamsWorkSpaceSize, void* gemmWorkSpace, int64_t gemmWorkspaceSize, bool isLoraIn, nvinfer1::DataType dataType, int splitKSlices, int minKN, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu index 088e5aff79..ad2e904411 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/stopCriteriaKernels.h" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -249,4 +250,5 @@ void invokeExplicitEOSCriterion(TokenIdType const** outputIds, TokenIdType const } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h index f60ac784e7..dee64cabca 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h @@ -15,12 +15,13 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { //! \brief Sets finished state to FinishedState::FINISHED_STOP_WORDS if any of the stopWords is met. 
@@ -95,4 +96,5 @@ void invokeExplicitEOSCriterion(runtime::TokenIdType const** outputIds, runtime: runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxTokensPerStep, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index e6e4e82c92..3d6e2b730a 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -20,6 +20,7 @@ * introduced in https://dl.acm.org/doi/pdf/10.1145/3581784.3607062 . * Another variant can be found in TopP sampling: cpp/tensorrt_llm/kernels/samplingAirTopPKernels.cu . */ +#include "tensorrt_llm/common/config.h" #include #include "moeTopKFuncs.cuh" @@ -34,8 +35,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -1696,4 +1697,5 @@ INSTANTIATE_TOPK_LastDim_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_TOPK_LastDim_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.h b/cpp/tensorrt_llm/kernels/topkLastDim.h index 31f9a12420..08379da40f 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.h +++ b/cpp/tensorrt_llm/kernels/topkLastDim.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -35,4 +36,5 @@ void invokeTopkLastDim(runtime::SizeType32 batchSize, runtime::SizeType32 inputL void* workspace, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp index f3b6decd38..b3d1e3a721 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp @@ -19,16 +19,17 @@ #include #include "KernelRunner.h" -#include "tensorrt_llm/common/assert.h" #include "trtllmGen_bmm_export/BatchedGemmInterface.h" #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" // DO NOT include cudaUtils.h and logger.h before BatchedGemmInterface.h as it #undef TLLM_LOG_INFO and co. +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -680,4 +681,5 @@ bool TrtllmGenBatchedGemmRunner::isValidConfigIndex(int32_t configIndex, int32_t } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 959c500fb2..0cbfa8ef57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -126,4 +127,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 1b1ab14a2c..5da0e0f043 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index ba5821a8d2..d348d95cb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -17,14 +17,15 @@ #include "DevKernel.h" #include "RoutingKernel.h" #include "runner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace trtllmGenFp8BlockScaleMoe @@ -599,4 +600,5 @@ void Runner::run( } // namespace trtllmGenFp8BlockScaleMoe } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h index 4edad536b5..987b953ee3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h @@ -18,6 +18,7 @@ #include "DevKernel.h" #include "RoutingKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h" @@ -26,8 +27,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + 
namespace kernels { namespace trtllmGenFp8BlockScaleMoe @@ -396,4 +397,5 @@ private: } // namespace trtllmGenFp8BlockScaleMoe } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index 899769309a..e47ea6c668 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -17,3908 +17,14 @@ #pragma once #include "../kernelParams.h" +#include "tensorrt_llm/common/config.h" + +#include + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { -// clang-format off - -#define TLLM_GEN_VERSION "10a85386-dirty" -#ifndef EXCLUDE_SM_100 -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern 
unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; 
-extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; 
-extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern 
unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -#endif // EXCLUDE_SM_100 - -#ifndef EXCLUDE_SM_100 -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned 
int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned 
int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned 
int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -#endif // EXCLUDE_SM_100 - struct TllmGenFmhaKernelMetaInfo { @@ -3933,9 +39,9 @@ struct TllmGenFmhaKernelMetaInfo int mHeadDimQk; int mHeadDimV; int mSM; - const unsigned char* mCubin; + unsigned char const* mCubin; unsigned int mCubinSize; - const char* mFuncName; + char const* mFuncName; int mSharedMemBytes; int mThreadsPerCTA; int mQkvLayout; @@ -3950,1957 +56,12 @@ struct TllmGenFmhaKernelMetaInfo bool m2CtaMma; bool mSparseMla; bool mSkipsSoftmaxWhenPossible; - const char* sha256; + char const* sha256; }; -static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { -#ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9a176820807b10f588a749383d1c44f23bb3dc25df12e4a923caeab4c9e6bbcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a68fe4d3f52972233e7d25ba1cf4b19d88ca536bda57c2a2a3881020e521fa3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c7025ca4368ca2c5877558bf92c345e554722d7dc6b63cc2a18847ba500bd6a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ac9dac8402da2d235aee7b15461fa455658b1bae4294e07ac1f07391ba90a72e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "104bdec218bc21eed45e189a887643afacb58ab75c7bdef306beb3530d328425"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ba748a2127a77a0347795486f6c1c1afa7c8510881355a984e27d143242801bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "efad8de250772cdf2d714404888152ed5c7a14f7012afea154f5bf289520bf4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dd4ec9a714eedb3ffb0955d0a8a299d99528a506b173fc23e88779051a6d26ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cc5b83f0b1e3ee121dc9f7c2dcdfb17b55ae37fd27462dca98effdd971f6fdc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 
128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "daf9f9e5be485d97ae8aff12d858b1908d91a2122816df8840cb02a9389c2519"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65ccefad62f3d50738774e6ae297c6fae486bf861db9eb805857af56b080747d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "899dd5f6f56c7fa48e52d18c6314232052190307efde0659264ea8f3bd26ec3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "75ff058e28e391081af0e64633aae0a845dbf2195f900b8c3ae60cb083891533"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f4cdd3cd8d536ae5ea8b2719e25c55c75cffc3f003949ca4259f301e6e8ba196"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9db61c77449b6d4f7eec6532b436260222e8c74cf51d1a99fbb6311dde6615d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e5a08a95565cc562bf35d3e0fce7c237262bf75ab19c70bcb01111a2a3606dcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cf04c039f22f5fc9b51e10c741ea8b6c55514b32b4ae3aa46c077be19bf438a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "37ba73fe08bf06260d622b70f6fb76459318e475e11eb04e48a7930662edd0f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f76548d39932f5dd1dbf6227931db8febea9864a8ca0a5cb33c5bd9a6bd15e40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "09a630e67459114e94b31b24e3f74cb238e102771b842486fbd6c015db21b885"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "89aa4e32c12e9a400379b1321f984e480d56987fb4ee1c0b7834f45bc67ba522"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0b7f1f80c34ecee58370a97022d327315db7f8dc25c61e78de9dc514bdbdb2b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a0a04ce61b26f2d4bbebd21473089f2509772c212c9f2454754c1130c224c465"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "176d5d860be9cae393482dd75170bfe9d146fab044b2dff583ee0391b03bcc42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "56933b78d89af2b5fc937a729e8803c3b66e5bb71fac757a82df23dcd56fe993"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9695062f74aafb70cd2f6f651cc6912d3c111232785c57bdf24b52b466c40e54"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "dbee27129094dfc385000640e5792c9ac4ba94b4a86c6a32d6cb00b2a3789bc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "86358ae483adf6545077aff4bf2e9a547df94cb31e117834af217af47b1f3fe5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ba58721fff014274d3cca41454593271fb7ceb18d728297be3b26258caa592df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "d3e1a69415eb65ed13f10a56548c56039c8b5fb4700b3c216ee45600db92cc6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"59e61278596daa47fc62e428d284235e91006e3e6f33141ff2fec3b096a4866f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "4d21b42c58a098bd82ed43b89213e95231494ae157c4201cc369e5b9f105f14c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "eefd8a2b814aeb23d6375c913980eeb11618b87267479e166fba8ac180df76dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "088d43d76773e80025ae7b74be526cc6c75db5352d1f0eab70450ee0ded62059"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5b5500588afb1ae0b7701fa0ade1a45582d01c22da683c7573c0f5e15ffd11c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3ab064eca6ab6410a4c5a50db61b5fa84cdeff359fac4b64f6b84c3ab2ffd695"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a15573bdc8e7def60fde2ec9851acd7b1c47aa715d378a1b6999364ae2edad64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2dd434eec9fa0d5729dd9bcff69908a27e4fcda92f90cc87e39ecf4993caadda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f1beed56222ae46a9401d5f2fd19af1acb4206571034380d0de45de600ac49f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0e97990087b7b71e0d2fa9a6bd33a5ffdfb89166097cc9800799f0aaebc78c62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "f1b27f666d475a4ce2a147f463e5590244cfafabfcf91d3e1e256ea4bef95e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 
16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7468ff1a4f007742ec0c55d80a0f6ff43cbecfa9d9ec7b1a8a661defb9a4ecd7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6dd6f2fd0facc54aa0f09405fd2822e08c277a6a122c5c3ccce96085b621f6e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "27730c25edb8153b11f3d1483eb05b0e0ffdddd337c17225e9df0273490003fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f4bb2a4c5aa62fd750c35d02fc615151323852c96e7db6a5b6f4e946dc9c12c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "afe89c897631fa79c40d8ab41c3e2dfb6b74c8c23fe40f1533a60ee0fb679eb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "058e709a9984f5d288d5f036b4e17306e4f25a9f2cdc19455f3357040d57af0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eae703d47ed77d0c618064acd9e894603950302b0cbfbbe58546745d5d3a8a69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9ae2e3d1f7eecd49644300253d81b1182be7aa6cb8642eda67a412ec679b7e4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5aac2410cf8216e50066d86d26448b7486a213a3b393ee2529cdd0a23648be3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ca752948487fd3173e12b74318515ff24c2c29dab9911caa9571cffe4c8551cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9de13b6f3e674e181a1d6fa83b3379e5d53233c87a8b69260c0435412b59d3ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "73dc4b25103728d05869b4cfbcae3ad27470cc3c6e529e6b1870b94d64896f75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "863c72a6f1d28afcc0195a0d73354da7e07af5da52ba37aca816b146f670fa91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fcfa117289d78d87e479d74f400c933e85f9b6a3731a98c208ecc689fc8d38ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a5ce0ad9e94fd6b1ef7110f2c98fe6f0165410ed6012d506567f132882a45f51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 
8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "07d6ae141516b7d3c9ee1a52d7cd9b2c0fc3fefb1f0ba8a101dcf457c14bf68b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b422236efca62d3480acf30fa6cb08a95601405299111a09467c1a92ef11f0c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6392d187a435e3f4de9293ac3825ef580e925ae9a5e3891f72f7d4a523a344d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, 
false, "891d89b7d3e1bcf595422342236c1509102e16fc50ce4d1dec3f3a7296fedd52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3c2e4435a35b45893239b522f1590dcfec6551d9e724a528e084cbbe84045052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "abb4dbe18240b09d2b51c35331c2a9d7922c040f310b9e04841e02ae08bf9b1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "75ef0359bd85686a0d30a884eca260962f2dd5759610207d0924f8ad739d2ec1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b3c77953d8a748a0475aee6c788a58400ae6b44dca57df3f56cfa55f682ce074"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "669bdc46670c721ce56be1f17fddbcd7b9a596b303467cbec628f3309e623a57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "732e0739cd78890ee4a9892bfc5d78e20fcdb8a219ba8d4d8cfe14c1d3a6cebf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f94e77c36416243876c49d8cd8900bfff7b62091047fdf18dd37c33e1d5eb722"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f303ad74edbda689701f0b68c9d6a31f4213dc8a6c28105584d62cf727e7b755"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "8b20864dc92a5fb4bb448c91e95c7a8ae65ec83afc5a62e56e2ad3bb9cfe73cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "706d6f5c5a101340b007f8dfe4f3056171f53644a825c2d7d1b03ca60d37c5c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 
2, 2, 16, 0, 1, true, false, false, false, false, "17f18a2da5fcd46f2b4f41aa7c138207dc6920b6f49fbefe5f8934ecac4ab42a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8b920d35b9de005aae71e786c5d0f8c2e1ff3a4cf0909657397e3b7f02f9ce44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b552d88d491de085a8468d6724b425ea5ad22ae3e567a9724b6e7451cf7d038b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e0ee2608692b7ce27a3a5a2c60fa0e45f0424e01d80895df144e788c5a73c130"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "84feaf256a5f34b34852e3da1686eea074bef3314d861e31b42d94e806f45c91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "51e61d2d8bef61988b1242d79becca1cc8cf86d9020dacc9f1d86736fa8fbdef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "e46903333a5fb00aa0aac43bcd2bb8b08432de27167d70ee5b739622bfd5afca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "bddd3aa9389f091dc709a77bc887cba95435ba18abf5e74b6c0f57a9c91cc781"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b6acefdeeab699b3f6f2eb0d0da419c5d44353251bf0083648d547369bacc831"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "33fdce28359c88adfd47829bbd1ace7e4d2b42da776518f3990c747b47b19b19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, 
"d5da56e936d6d97fcd7d5e028f441e86bd5937d249a1b7764865cbd9d2c55732"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "74880ad89dc908c813fda616a9b992e2820881515e4922917c681cebc7dafdec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "78d269a80ec7e57a3a439b03f5e8096ef56e3e80ae5ea89dc483c1536f4e5b91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "25b99b3ecba8a22be5bc4d19303d5a43d7d528f065a5cba738823f3677450bd3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51a9447b33f700b3fdcfea211f2e98eb05389be809d1de0ab79984bac47668a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d3c9c96ad45d607343093c382df2d967cba232d87c432b9d5671627a30f05386"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ae8492a52953a1b671b54a366a9f6e8abfb2b6a43e55c01233114c43c2e975bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"c68c9158c88f0285e1183770f519514df7b3d14710bc59e4f94e68c39cb65f5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "220a7a7960616e00e07c57384faae712c2c73b13d611b9cda562ea8df0c60600"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "004b32f0077bc3b728ff75a45f9c9c99861751868b70e742f55e2a4c114d26af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e334e8bffa1e4b53d8426a1ed779e9fe02786c689a1ec480460fb15b9dbfa0cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ae19d2d2ac022782dc92fac3f42c79e480633422abad50b243cd28e32642b03a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4ca6a1c9cb86de7e69504f8dd0ad52de40f46f6800ecc701df403598386273da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "edfe01cdc13a53eda44df09da4260243accb2eff254324bbc6d95f945a699c9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6491108be3eaa9d3a305454a852079e7cebd5d4a9dd47c8d5034a71b15bff59c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "15053c497b53cdd92add998ebe5e467e95bd57202cb941369b8810a0fd3d5d71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d902057b573e72c81468ad3e21ab7bc8c06fad6b993290e0f6806ad7f29ee792"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "89479aa295fb50821d14b07ac5e6d39badf6af5952ed8b9e75886c0d23e60269"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "915571831c4a3baa43d58da0c872c6a5cf63a7076ece7f3b635e92b1c7128f77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ee4086173bb5fb2b4827793e61c8a75a7b7470a149abb8f4561d2db7ff67cc2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6e81250777e27749c126497b900d238adfb9d77b6de67ac304933ff58ff4438c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7f8f6f473ed330959a1bf992f7c6663433bbbed2c1dcad97d061eda933cbb553"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "01e858a32b62c644fa1ec22739fdf1388ba21e9a7524355ef74b43ea14022404"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd398c620a37556d84e69b52c17ba9a4feee2fbbe3c97d31d28ff1225b47beba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ff194082a2b62f60f7ea6b1152a9cb3dd6dfd8a33983d1f88d965847f1638b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "52a6710a7b5e7bb005299e77e8c3e740999bbd853e1338a29eeb89076fa8c0c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"5aaf6b37b523bb1e8c50c42bf51879ec1756c1129cd0b780d68fa6d362ab3c51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "71a4f3d040218f98334725380d3724d3022b14451febea2af9befd6577fa0fc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5963136e972a354b423f4b5f87bce97a167e45ecbb90d29b5a9086299e13fa93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d7bdbfb47e2a6ea950dd36d073964f69e9347ac6b44d44113af8e8c24df51341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, 
false, false, false, false, "d64d27d8c4e1eeec885fc04871b263c052ff283187ec7cea37ff3fa19d5382bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ba999ce4f40bb7dd830a184bca7951dfa8bff71ef8e2fb9a4efdfe178a9dc8d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "3c5c8b5bd002495102e0ebc01d67e6b6efb8d4c345f464b8a9e3d6d0f6e7079a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b2916eba42f5cf8a1504503b93f7951415a6d1f06d935f4abbb8ef82cb9317d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3be5829657bd310d44197385ab64341dd0fda8d85901e061447c309a95ca02e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "a9c6597ef687ca0dd63c6a0d3b187beff05e35f6d3c1825040d7675eb7dc1d16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "362f02e7698d48b396c92659055af196108dee06fc2bd2e7bbb44173af3c53b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, 
"6dcdbcaa9f08bb6ba5a158c4580d44b7b059701269e0fa0d1513f880b0bc40e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "34cf51cf50bda51d4c3620578824ed250fec11159e1124e0ef419ab452067c56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1e3febea4bc2cd225fa762a1c20b7b58f1a9cc4104924490227c3c6f96c4182c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a17f0d60d9a2bae580742e3bdc2100825e3e3775ee97bb02c9ed155fdc4c3cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f500a5dba46d45a27502eebf4d7c074ed03ad213b15197d6d730f926a730b62c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d1be4373212179556613b06d5da19516f858a72e89865d232d3f58fa2b72b038"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fcbd6e0db465b8210c69746b24970733490738f73146c574572013b602beffd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 
124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b5f134c6a7498e79ba8f825bbf7e4e5d05605ac264c87f0562a7bf2e3079520f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7e821651894ffc8a196bd16514ea1accbffdc74c58f04c99f8741279adb7d691"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a49f8eb61a238ed1d4b24bfa5a438e365ccf22b5b5c115e65e1b9cc2e5099fcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fb263b43dcf76d212423f4fe3291b68ef434a6c64d77888679254887386ad99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "add7f579c80e58a158b1c31e6ee7683fba9cc10f218b92d1f84dbb51c5e89102"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d1ff0e1fcf39b3f5c1b16e45ec330c7e632e22fb798aea6e3fec7bb1b2c9802d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4a038dc6820022d273d4d45781de7b1a6eaeb2fbf7c207c2714fa857dd87b137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2fbd4767059a0fe411b2f9fa3b344ea9e03c9020baaebfeec7d13f7e7ed2f4bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0d24488ff215eb5a38cc37495120be92ebe05a3a6878b24003559a8d2d7ac866"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "225b7dc35c64ccc33d06180243fc945056d116b6c8134c71479cae625f20c44c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3610137ea0fc810d89c5bd9a1c31807ed4f2429ee5ecbec2597e9ffe86b19f2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "304e87adf8ceb391ee1941ab129d96b9cf7c81990faee7c8010f7298f8438bcd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "98bf26c1913472257b02dea317c02bda92ac1e5eeb1bde1783fac95f61ed29cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "561aed2537e7eaa7a4685a821cff4390f841b30a2a9ca78377daf18e5f24d06d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c1e1bc69c47c3c80db81b39547ca6502a9bb9422073c1d260b3177b64bbeddba"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d0b1ec60d316307cd0c47f96e931ee83233592200a4fcdc4243e6d70a913f598"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "66995087c70642b64b813a1b34773eb69dd9d67e214c27136d24e56362feaffa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d6329538d8fec5b01313090e1ae84ac3ae3ddbc96c0d62823c5ec2a1abc6861"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"aea86c6548409f1a47f335d3aec8488390a12f7705fa62a21585469076de0b49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b325448482814d86e32e6bb82ea17a816552cdad61e1223e4efe32f34cfb484b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d10b8333b75d3ca64b1308acea71d90ceb6f8f84a169841d03897cc42a6add2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0ef97810ed94b7ff6b0722160d0f9c6980f7a482407726608d48123d734e30c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bb06974794b11482c54fa4730ccbf601fed79a0eba283265a05cf565c45110ab"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e64d1bb568e2dc9fcb7c26b8de6f8cd4893d2730d9d34b0c6a9cdf7faa9fee0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5745282ece8f2a11f6ff3947557f5e8a25802861285b7c695f1ff5602cf679bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f061d7ff7c70186556c98c083424b6e701e210285caa1f8a0a97d32b11d57ae4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "39b3703d50571308c760adbfe08c7d7bd4dfffedd2a684b61f446146f1b763fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2d5b67610c0956e5891007397094db6acc754885402d8e7e3c620999b7779d3c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2576fe18a70c1799b082a945ca406d5e34a04edbc82dbc8ad38e6a0c6246fec1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "30b189778153228cc3080c8f5250f192b9950b8a83c171cc17899a4c926ef030"}, 
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "360a2ee98ca1e7a2450fd3a697ebf40404eb1cad89d627ac22fbbdbd5f4ed032"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "851293d99175ebfea8e36cbdb70db56542eb979315896416c56920bd672372f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "174ea0a369ddad5bc63c79507b146689009d0e5001b3bba9e2b082a6c953cb15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f231c5102782e6f55c3b10d6fddad400ef74213788b6ecfaca1b6f10a0930262"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4c3731a229c255180e2383b0a2f23bc888080795e81cb62fbf82b208381e5240"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "58df070c76bbaa6ff2323967339e9374979fbd918f5fa479ca664f1ace49e547"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "70cdb3642c3a1b4022a5a3cbc90e521205e8570c0d2544d478db5be26561e638"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "40cfdda4f209761ef3eca907fb84f09130214cab4b6d60ee69ea1672795bfc81"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "edd9cbf6540157c4a2b9d15161d79b383a077832f307f3afed73819cf665a481"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "41f9ed182f80238688349226b59a19f90106ff4b17910cdedd60f37d4e2720ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "285fd78c597d80fa9c353a00c3cf37ce07a038fad316b91e196905ef4ac5510e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "1b42f5dc3d6e8f39b15eb98d26429a84ebf92c329714e453ec94c5c23a951cb2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cdcd8368539d349e56a0da3f917d3aea8f1bad85442eca9e71cac099931a5cf4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3a745ec8ab72d6e90b807cf52d631db44d7b4fbd1490b04ecbdc5ddc8d09f36a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f5d7b6911391be1aca7403b8f0dccb8e0b0b42bcdb1f703ea5e6cde9d05dd568"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cedf712829ed7eedb631e6616b51b20b6c40f7f71bce5aee9ddf4e959ff6d3f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "988e2e64cd365677f93c1b9623883c79cb1045ec02aaef2d20635f36e0195ea9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "128f37705668460ba6579af4ce7bd3864bf474f0cccf9b6444406478f1bd26a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "62f9e7bd5ff48d0d4b77ba80d4b1b722b60762b0ec778de638c9b1a2754224f3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "14d3e3c8c95131bf255fc36e7c6e45773dbab760e485ec731477c0f2dc4a7d02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "934c2d4fb468df623d74b4632b2a9c939a6f95f849f3c8a1beaf35e669a25e85"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a22a2df86dc7210614af22e02abea4e8c7eff6916545a5ff7c5d8da7ac443c36"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d3548d893121af62638608ea3a0b6ce0dc66243292ffd6324477b6ea9a8c7c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "edde20adba8bf537b44729dedebcc263e4d8db0f9ca0754a5ea3971f73e13f69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a3a49eafac4fde1df3976a08d56a347361c190ad316d52d53b7e115cc985171f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8fcce917d59b60c70af6ef18e57d57e238b15a28b4bddb8a63ab683f86e3378e"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "523fc0b3b8827a56176e7894ba229bdc7c824ad54c0465568042f710f6abcc82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "88f0843d0c1308fcbd0729b321b0a84aa6cc75f19f7b1788371ebb4d268fc3b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c3c5bb3f3002b8c12a154ebf967c05e8c58cb0deb2f85d83b68a4e2bd5b84982"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 
2, 0, 1, 0, 0, false, false, false, false, false, "691fc35d4b75894c36e33d3236cfef861a454db736a68756588c31886621dea4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b346e12d9b29ce92015c55eec7eedc3943adc616e196ebc8c65684ab8ba211d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ba50b9aea73f4b0a9a1fff264affad591a1307e07d3794e8f018b0f7d8c74993"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "84a6cb586301499f6bea219ed094e0c87df0217757e940c963a3a4208dedbd61"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "97b70df2d05f5063c18437f97937d44421606dc2bc605981e0265e1c9f4ce81b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fd0462926b6a21909d90ae77aad39edfcf53b1afc777bafe0c56887cd9d73f19"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d86e163089210d1517adac7970151574a8c7ec050f55ac8eeb1d01b4091d983b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "af05a175e01e48825c7a4e0e53b38388feb464440e950e0a0e4794538214f631"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "28d3f9a2353563b88f9ba35bbe4a53b22ef53e614da53b426004d06ee64e2332"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4844c027bc8b7b0909276e6538cf58f9b444e5930be9fb026ff9d815df6f7ea5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fab3d3b7c808c4c0fad11886427802bc38ec8316a723fa064eea9dbb2547379a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 
0, false, false, false, false, false, "7468098ee692f39f3afbba7a7e7eecf095eef3c46b67e5ae78dd0227adf119bf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b164b53bdc9f265b27bc47c5aa7c4363ff83fac3e88950ea549308ba5a5e0ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d9fea5f24789a73af71c33264f900a0a084619099f35d7aa168655535881b603"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "75372442dc47d1d709bad3ec7c53051c9ccbdb4092993167e30de7cd38ec4d7d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f17ad517131c43bf9b7fe3bb9b00a55c31a76e7bc506dbe5b45315df484e51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "be2d3ad4a316e2a084067d79980221cd36d5ccf0d7286698437eb2cd4fa3cc9c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3d096ca0355ebdde61032cf1c64b2929f96d69db5f19fa8c9ef0eb1e7f7d222c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "00b046486501488c634cc21e5c52fb203fbd0a3642ccf540f01da82e9b673de2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2b0bd4206a8b2992313d99f72e6b9a63926815044b32ed11c0278ffb4e3c17c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "25430c0e17ab7265bc64de4505af06de58f3c937c0841a6ae126ebd8f20e4c3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b64c18ba0c89a875343843fe48bdfaebf78c611ae0991b29e74e172dded995eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0e7535df2b4da1d28171a268520c5e5c315dc10625b927f2144601fb8fafbe65"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8bedc7684f06091f019605565a03c006a1ebb06f9fc52e6a64a161042816d8bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c1e9fee0e21fe54eada9e9498e268bc3c9deb6a9bc0ca4a1dc16f1f25ebcb323"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "38c619de4b7e892e8dbe1e9dcd6d5cd1213c0eac5d419c3127ec2139de8feda5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, 
false, false, false, false, "728fbf93a0b5da82531843f1dabfa34621daf3ff5a4821707748072b79c5a626"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "43de26822a86d91e1d32c089218118b6f856beff1d1fc8484385c4d842ed6279"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "30b7f6ea197ceaf087fa0a1d66dc21756f81b6a8267b665197f2c96d7139646a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "30a5b34cdaa7fa6b304a6e8965329570c2a4710bb41fe6d8b33de40d2bf73153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 
0, 1, 0, 0, false, false, false, false, false, "ad72c0c73ec5cf21ebcc5e0487309c55a322e68c0ad979e42b42912e8377f5ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "096018aaa2fc98018ea7e485b6e24a8a9d0d4b70ebec4c22a30a35e004de04a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a151f63cce163b64f5bf01120517e1cc3ec14214d8a0fef41d57a0215a2e39f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "33411bce48f1d810602fc49ed92324bd904eef190ca19edb5947021fb5b64169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "de0b120a994e0c7d94d40b4ec3c78007ca45536867011c58cda26ffadafea397"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f72b534246e9d5c10c34369557ad096bbd2bc13220ad1b0dfe726457a9c35a27"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3016c3485aec6bf190897476eee2d83c926def5c1d703f38a105ef4285135f4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d642ca653d460a0e0e5b47d1aee398b13a9e390eca766a0e1efceeb03d12ee82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ce99dd160cd8b68ff089326c56218e8293f9229bd0018dba5920121b13beaa45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "248b34b50a5991cb56d5468c149188b9782281f61e0ba987c5b9b3b08cfcc72f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15c9fe05cf708391fd5366db20c132e5d7ebd3738c9615353ac44aa096b1a9c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4fbb23e29da3a87eb2e8096af063a96f124e51d78bdc7078021ec44ec171bf72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05928c0c6af7af63790b6045624b8b07363578f201d99173ee389d4b1e23fdf0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca768f2dd7d55554ca31e719474a42120084c513c59023a63705118c80e9c919"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3d86b6fcfb862d696a1ac4b7eaa8fc5a3e8bcc54e85ddf64b1829c17ed299559"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"c6f1bf6be7f6e69ce727aeeb9e6a3a1b8bec3724eb93eba0642d05cd54b2c6f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "091432db802b2b98d120cf3c42cb3e43e0f8b65ff7749fece54eb623d1bb87e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "8448bf00d42cf5e0413494f47cab45d228ba1a0f4f2d23b8cad2bc6ab7a5ad16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "07d0858275796e475f7a67f3863bbab6a1d9e2d5c9bff38436b593fa5f060c4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, 
false, false, false, "17774a61f32e41d9c79fe01077fa050556bdf9772c737889a183a88d61414583"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bd9ce5f85e27131e1914187756dd073be1e946456ef5c7609890c1320ae61dd3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a52884fd149d0941e4e9003ae0d1555edd0aafd1e6965f5bd9b6411c49d95e11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "5b6cd89ea5356ab0a71fe782b8dd14e86426c933114f2e7404fe901312e1386c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9d0bc5a0a0760522873ba71e5ae364c8c01d0d697c180ddb4c34686872bd78b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "654f5101e202198fba0c7cf0213b3ea27d550f183b84f3016c393db6aad0771f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "db2a8093e717f5507972c7c8dfd974f3aed7c1240892998846fa09b5fc33425d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9940c721d27806545e22979c70b74fa77ec901e253489bbe9bc3efb154dfbb92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7e46fca53b58b0c88889b1d54f07001835b526dca41e16de55db091c61d488a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "79891c286236f125a415496ffdec30b9f693482d8153c3d4f6c9da2c5531d4b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "b0bd4ab823dc67c88fe798c4e6ee75397954b9000252bec56d3c0822eec88382"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43ccebdf5afb652c0014c739619b905f6a4c23a9e4181866ee598d84982409cf"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "47eb66d6637ccdfa086d8d5014433f1fdad0b49eac00e9018600cc742b78c343"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b36a831c94558ee1c75d868cf2bdb27474cdfcad9e233743fd9130bb3cc352bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fc6dbc1b0d19ccbfed46bf156c6f4d5dcfe4f117c4de3eadafb7fd9388fbe6bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ead70acf719412e6d7314a18152565dbac084bf9e5b5e40b0f361036e5676e5e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "62ddf1fc0ad2c695882f71b432ab312168d684455a7de5a8cb4fd18dff39a52b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f29798b4b467211a096eb1547a123a86d7532c9f505d9833202b5130dd412234"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "46784c1148adfe5d3e54a9ec8a8a8945edee6248de64890bf9bdbb3323c7448c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f4b418d98ea96c5eec974ef0fb11d6ad7a2a5fb01e7f31e0eb0ad2607e7faaca"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e0fed08aedc63377b016eb9d51f56c4c0778ad77bd7ed9065f74b06aa12adefb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "784cb1116acc421ee239469372c0d3ee5f6b5248c5289a6aa1fc46f8c1351311"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "eb50c70feef78193e29d4cc7c18d22bfae5919523fa5600afc8dc4cea8ce50c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3d9743a13bffb04ca16631ef9cc9415b4fb61abd85834948e7b1ec19013dc290"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2cce7a54bc71ed307ee92b751b245e5e1a96045d5dc23ac9d0272e8b18e14b7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6faf09ead6025da5688ea9d9497199c96c50112155a3d95411c0ae45221775fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3134f99b1884abef7d522b565dd6eb1f6bdf4d97e968846e7c056b7aa32819b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1675df4c12784931d487b76785f477286056c94656ff06a12c3a27bdcb5b7d4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2b5b83d88ed18b9cd246d148750f4dc25d7b99d5f63215eff8828fdfe772436f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f6d9c1cda8a696f3ba8d0d25e77f11933516d737295d5d72f81039431ede0dfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"3e2d8eb87f7b80fa5520ba90a81bf419e5479fb11b7eb69570ac29ae139f2664"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ef48511e3fc3f2131a45923e22c99d730191b68c0654a560cee28c1f839889c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "65927f03e0c8fb2463e61185eeb4459f7eb6cf3a5dde60e4a1e059443000103d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c1dc8009edad2b644408047c148b0bf2397b7e86ab8d0d444aff62f4ce099754"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "017ae5733367251d3c9485c52b03584dec67dc51840d766e2b52dbfc7c0311a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "50d1f70449d09387e02d328657d16fd756ae2eb43ae2cce366bcd8a12ddd8cbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a63adee229ad5bd427701ff68c7f659c720d74e814a8feb4eb6ed614f1cddfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cc44cecd73dccf2a14f0359197ea5988f7ecae9bfb5c7d1828c0df6b4c85e9d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8ffad205c54903ee7f8479942aa7008c871c778cb6d9e63f5a208c4dd0d07b00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06bebcfde741d33afe83e3e480c2ab55f5ca120e22cb613e1da797d745d35cc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8662952ace08cdab7261f5951edffc716eba6cf5824d921d0e35656bd0af5590"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "14b0fb3b21b473584a98440288bfe0d098d55f197b2ad3ebfa27bd90d3b72300"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 
256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c5bb8ba04355cdf068816e65ab590881bdfd181dcda163dead26020908489c84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "52c4499452e185674a37a9d972c3fa69410edcd97199d17b1016ffc3ad1a77e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4df2e068ccd0e38d5cb2380ce507c695884ee03db17179573ee1a2028f22b3e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "44fd7cd29958d6630a555fdf2f63754a8c162e20f63c64e97a0aea62b47b7a19"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "565c94af7b1302931ff0eb325e691f61af5f9959079028f0914e6605f67a768d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f8705e7c1ade0fa1b742140f26b99159faad1782cf32d86e94b7fd7b7a41d012"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3aca6a206e4b75966c746275fc543a5e23883eb56887c9b1ac5eba1c54a240c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fbbd81a80c4d5a583c285b90d454c49c114e08aeccfd4464fefef4b116c18921"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2c904baf842d519d6694278e02c8d6007a8c9d269bae98649c4f8815eddddd5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e2e89c166b11ab40dffa4c17e6491ea1d3d70ddaa37e2e458d8a8fc3659ab777"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3c1d890a72eba4035a1ae299ef88d4fa9b49339bcb06ffbc400f69a9f22d7682"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ce8d1aa3f4f88ac82f2c67fc265552077cd06bfe4ea5bf7b9a183291c4236244"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6a42dbe4131fc7a832ccc8afad002e96568612a2997a7e66531f5ec714dc3254"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9acb343e29a8fc3f5b7d72d2d655aa853e2f8878e49f63a0341127044f0aa7cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "98456f49c3b0b96e4262759864a2c0d00400c5d420bdea2813b7acb667717739"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 
2, 0, 1, 0, 0, false, false, false, false, false, "3fea91b0ffc061f792eda7469778f92297e37b841382af54fd06685f8f3f8939"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "12a5de4615caeec946d1c5b05077c9b818624cf210952e297ad1670100192758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "086778dde4f1a41c3514ed1ed93d59452d0edd8897eeaab0b5ec7a548a5d3d3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c4217d7f363581ca06c28032f675214950da9dd569ac0b68699b2b7210b645b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, 
false, false, "c6e982d44fab3b555a12cb80dbcd22098c6bd38ce044f051d0e91cac070c1d2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "29cf1d6ec224753c06deef3936dffd71d031d3f371da9cd6cfc8911cf3f4a34a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "734ae1e9295868c9b4ceab477a37952d3e6cef1ccb324e5ec407885f51cdcba2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "43d755cdb6868740881da19356a9954826aedcca3361e74cf6aac714eeea568f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "521de02ca65a70565832c1e36ef1cfba2e3732a288b295003df796e8a1811f65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "890f351adeed7cc61fa32f105d9835fd6ead944170be82463f2ae1d3b4510bc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a00b41e480b0b996e4cabcfd9153bb2aa0cfe880e44e3cb2e8a09626438a7492"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3cf27418b4bd2f68f77c72cc2c4727ff16d22a347cac3286cfd1dfb8e22e5964"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "692b6d122243ffd33a3fe4c2736223b7c4501060ae76f85892c7d3d08a6ae14d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "fda2497590ee4f894b71570f16d4e30dbf756400889eb00b21251adff674dfba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d3795ef2472b35cca72394dd3652e80b697e050757c3c081b270c6333898d067"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "da22bb8af036d7fbfc381afae1694febd958fb87d419f09d23da1b30a43aed08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39929a0ba84a775e65e7fb37fc6451cb30ac94101d1ed3e85ed8c8b639baae37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f7fc702d67d904c4303bf60c4963f3a36e5bb4e20581682ec9d81aaea4b56233"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "593534de1d7d1184065a67cebd293e0f9bfd6bc7bcdd7981b38cfbfe78085b85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c68d7d13d63258d49c819a2d63835da3047e30fc4c2046e8aab619b4c60fff8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "75247495cb3e8d2217e404157625aad7dc27c17db852c7d94a651ae921ea0d05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4bd0f0f786dd934611e4a0c125da71b24e250e6e258bfa13daa2d0e8b3f5eccb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b6010ef4ef174f56485beca5c34cda23bfb9074c66eb92f520ecc921dbbc4dd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9e186ae186618cd715045523eccdb97469e0ee3888c0373696f6137f0d9a49cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7b5e9af6391281959ea7cd14dea905f9fcc719c90a49d7132e8865d4fc70a7a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f1b66b7fa225e818fd5f47ff4298e84b1bf36fbad76b35d401f488ad6b394569"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bc486d0c8030a057f509b671c2dca7500b63e285f13d58154ee7d5eea432fed6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1a5328d9f3000ab01f0b655440e3f381163d47a919e6865e138d877d6a1133bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d8096296aea9785b9f29889244a74127c0a22d616cc24ee4b05593b404a9f793"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "30a234389152b52c41e6742863dd64249abde514469c3936a1da2ab524ef7fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8bf67862c52ed34f6e27bbce85ac18222fcb9e96d94e842947625fe517e179ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d7edd2b5cee281bdfd82bbeee470522c5d1083dc5a0bb9b641d9476545160247"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ffa1e0c8c4ae2d277e43b392b8a6c58ae19858168beb96519793e97ba4806276"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "36e133cc7d4be469a9198be5c042636534db7cb5107d004ad59eb0907d7c8f1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "110fadc31f93c3faaee3e589cf978e2fe8df57d6550bd5aa3bf89ee5050d336f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"0f9a3508e56c111de3a2c44f51d6f1cb38cee388fa8ba6002b13cedb1b0ec9ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "add8a3e810f284d474ccb89ae3cc4777809312b661c491bed2d27f048032d24b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0098118e55b9bb97cdc72932c63fc900fd04359f8cf070356f10a71762b0e9db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c8dce4ce9cf029588bd2e4aefa26462c26ca9f7a2af6b0f3f1ea0b35d4a6149e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, 
false, "de35652f4273e1ab0dd9b62767e1e9600cfbd949b3ecd8085eb203eaa1e84eda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305773b02ead09e5cc1c91f9f66a382fed595feea5334ebd2d4febf0eb82f735"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0b93f9d285e124e338c869bdf405eed222ef420497d9e6308e113e3a6fe6d6e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e88918e64a3f645fbc35d7c31c12da173c7dc9071bae338907023eec07e4fe6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ab133cc888d89d58b291a08af38a277dd554db10e937f4d957a6fc3d423761c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6c3226b241a4b60211039127c4fa66a35a233e6ebf2c6c8a3c909f5956c8f30b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d71cbceb2e250a6416baba68380a3dc3fae0f29920cabf98faa0a07fe710ac2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d1c24b376660d3827e1904f3460c81577cdd828b32b3d7b0f4c79dcd142d63b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 
1, 1, 0, false, false, false, false, false, "2182a36ccfc2c6a7d2bce29d8d2617fb7506516650885750974002e96e841d04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "cfea90b58576b76d456c5448d44835d1f8865bf41a82c7f8d779a3998124af7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a64cfb54e986a4a0a55445909ccacebe6f95e8744c6a6ad31283cf426039e4a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3125bc998f20bac6f18139da940800ae165228c9ad5464445519f2614ae6fdd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 
512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "42d765817be68b343674f4200f72584760a7b8d8e60584f714338e890a785f7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e0629605585a9dc3ca29c94e6d3ecb090f7390aa7a2144dfa946b5cc3ed51a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "07121132ab2a614e78a6830fa08219d5b371886d72efd10395a02dfda23b3dfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b1b4784f94fc99ebcb5b7cfbe5eabdd646a2a614166c36a8c49568052a49a114"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9e78013e73a51e7d172b799893916cb7c8ea77eb5f1e77a6e64fc0be2ac755b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bdde2454c5bfb8f89e2b4e3d7a3d46fda9a2094d0d85549b859eee0588123f9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "1d58322ba771aa98b7c3673ce29486e0fc61f0f31b05e3670b051a3d9a9f9ae4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7ed8a8f825163e0b7d616dfcab21e70fe4cba85cc4c3b20b96ad062a3f0ce84b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6f6760684dbf8003eec4d1da8c4c7e7c7d16cee2b520267eba662fc6c07a3eb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "13d3b2de888cbd6b55c1171ba7a864977fc0ce5b9c5118d1e06b4b55b8566b6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1f8f3b98d1e19b952eba0bb3d69992fc3e9ac544352280f4140d603757656c32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "98c0b51f856c3b5bee1fa54df507a972a514a14b8ecf8d510d383896dd909ae0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b26fcab6ed7ac07620491a4984d09f258dbacbd1d69256b542ff94d9156d5833"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0148875e458bc35ccffc73826c276994262b45922e119724056516d32a246094"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aa5e40cdb0ddc1965baf8a0553ddd1e369fc9d74f566d2214ff2a0d4db410942"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b22913c662351ed92e1ba0a3f3885e5c095ad6b6ade755e05a62257f50c64c87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "910f225f987bd7f00f7b7b65454f5927968b18fce3c6f980f786d91ea23b631e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ae2edcf5436a32556461eae6dfb3f3206923d62fe9ae89cc446a202828514cdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fb8b8fb60a3b4687cf5e6e45ca09461f8755e8bf136ddc0d10e46e4a8e201434"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4620892c4f107aefe0c14b1ce834bd9b3ebfa8774d04f519144b1904ec039e79"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "89f8a5f44ec289b2117f10ed183983cdc4ccaf175818c4be0f8ebc13aef19b8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a50f858eab3a7eb7e46558588729b40cb4a2f9b1504104fdaadb610eba32eaac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9f541c23cb94c5f07f373afe77389acfff234285c3ad3f72d0fe5cd94bf0e91f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, 
"43576a7bc2622ba66b34c6ed8ccf3e99c0b63605aecfa298e3491458219f393b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2000aa4985ffb56c987879d7be43a3e4249f771010ef8e9a47b10780308d02f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0aae3897ddf148a43d8dec8bdf233abf0739f21dca2a28f3a1ebc18940b8319d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a4b3002d7bb2131c3e2ae2759591371a80832212af4502a6212e8b8551c67dc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, 
false, false, false, false, "41d810a4b8c5ccfd136174d43fa28761f2839fdd8c379d254d2ac756bb4ad965"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "050f3686439c173e8afb9bb75cac264d23f05c24aea7a1788e134d5db529dda4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e3011fafba05b8af8e4b2e70bcb15b50af3636584e6d47b2d091f5d4e1ccd4e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8cec52b70694ef4fed6d485f0b6c1c1a9f4d3cf695e5c94d3c37cb66807b156b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"c30fa0179e20e222f3764b0a826aca4bc7eced3d9aaac0f7001d53faf2a0d144"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7eb9604d6a3e54e4073c1f436f2ed2b9bc91f4ebfba39966168ffe6091c40876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5444924f93b394e9c7d00d702675a48d6a0032906e5622285560cddf8f2932ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9ce39bb886e9682bb5b4da9aa69821035949609a3223dd9d9f5955e661edd27d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"0ceffc9c18430dfe3360f9608813c70ee8cc566d2246c0d197fb4b8ed2e2a165"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "183b55b7beca1aa78103a4997ec17deba44258ec5c78d493467501ea6f2c0db5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "654ca58beff79f76d648bac3c772d0872dea61bdaca7aec1eb502889f68747c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "8850c7eb360b45aed3b3932732a7b0d744df28ab646508f48247729c79a05b77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d4f8f441997f4a8992808079f82680ce744c9cd22c023c343a17b1fd5af5788e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "74e3e84be6f7af61f793308d7a6c4d94e62033787aa206b2d1679b398979a2c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "77a15d58f1a519a7ea2c0a8f533c76f5b32f3fb3caac4bc34efac95d912ba7f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c312dd519690c409cbe24111cc105c8bccb42cf82738dd69728fc520d714605d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8dd12d3493d76d7841c887de10c722a8d86e91d1bd6002b318e9f2026fb90414"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ddff00b269b52581601694c3f49c7ec304ac157ba127d4ca8e469d7a117d028a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a5b881a7aef755621fee6a045e7e6d0f01f851d28204022e9cc671b2115bd99d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5685fdbd2a78c40b1a3d5d71573ef043c1dac6309a14d7f0dbdf1adbe30d25b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3bb2083f6a45d45d15e155b472602b80f3467394d906d2c3293825690e334b9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d581eea5ccb7fc64288a6e3f55e510b277c0ce0ab099b744f0a0cd9bdd79b9e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6063d1d036bc183a6b8f23a0bcba4e7e4c132545e42337e30e556748861636ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "897f3cf32abdef81f60916bec2578355725468ea669b5726137585cf6370355a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e82bae7ab0c0cecb0974365b6a28c40f327177c434025871c8a55a29ecce73d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2792c2339a973b4a7e46465084f26b24a2624f50fa76cf4cad7b69754c735481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0d2c6c5b7ac20e1139c374b28d248677579d733c0a194e81ef9a448ea03962dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "af7c49f208648cd5e68d635a21afb148f4213bff2b880426ed1e9ee2f5943108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "71500417ab4bc8726bfcff3d9b0d832b5d5be807e251770264e8023c78517b23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c6e954c0413245aa57ab8246d9b13e2c2ffb48668cc84a0a90bfc381da170fff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b5849b44d0b67452731bf31f13865c174acefcae3c065a4c15cfb0ae299e622d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"1d6622ac02d98a299d01cc9672bc0ff409afb4de43c83ac2f82a36a9562cf960"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "99397875bf3dc892731ba5fa8c3f5b1f17a588a013539dcf07bf7016413cf032"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "cfc1d94156b7b33f0e4438fe8a15445d45591abe23720c044621e378bac8f4a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2bd42e914aaaefd556b16a1c116e5b6eeb725d4d1300b9a5c9cba396e0591c6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, 
false, "d431ca520d04fff910366f941a6f90ecf94abc5bc1439a04bd668782c915e979"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "65e1cda0285e8381ec04e1195afb5a96124e451cf5a47038858f32baad6e102f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "71ee27f4c1a45c321e92e63f063e946e6d5dac32da94b3b41031be6c58d73d2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b12b4fb38cb5ae7a06b89e91e2f2682da2944bdadd1ae0b71ccf71a362cafbf1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, 
false, false, false, false, false, "9d4ae2617de40e420b6688d972d57e158307829b83e61069d50f9da55f006e18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f0dd2127382a5befbba6555b39a67828734767d15ec1293a6eac9e9377bd4146"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51733e9ca073a2fd83a52323d640993d3e51b81d59da8b633e68708b5c058e07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "cc0208bd2c6667a5035b52dc665f222b8570d4674e1234c4a1e43e0cb3a62cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, 
false, "068ec55e1afc1255e3019651f79c0397bd17139d61d12915c0ab0a146699a4f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bfb842eeecafaff0d24ccd27066b0d0ec324756782496204560b6c1c29fe9655"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f797c65d3e7d8b904d46a788806739f675a28ab417e94fa8378713f97461fe3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0213a682c6ba2cd0fff98cce71e652b0e7b4f85bdcd412b587ab28ea0f424cca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "04e47b8e3f80c07cf87ef1d5cf926a69f366210a4e4aa671235d3941930d0d48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3b7ea483a4e76bb008ba12003f96aa1ffb9ffd60d402976a990764b274727fd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a016641a00963ec80e7240695743cd1156f94a1069bae597cccfebe5ee26f16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "352bcd97106ed61be338bea371574ef03636662d77a688342d782a5b17fe799a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0b1743b2bf93c531ecbe9ea4109feb3bce8e92bf0ee9bf9978eefe0cd8a6afff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3e6ab2572c61239f2190848118b2469ef2aaf317a77c97bb7f416ab00a96c814"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "7c1826de40a5db061844f1a35fe055ae7059eb8ce44cbd3b9cf8034e94fae572"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a1e6d7e0aa1d0415180abc3bfc5726f73eb83f9a62f4bcb01e897755b2ba8dca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "859f3f1a7a5796fac7b67e52c24a193223ef9e9f18c9623ff172b27323c8e639"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "15abb3a291c20dfd5b35e5512bf1a7164670ec177359f49920ed27af440d8c9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "40e1505a085d8b60e71f280fea917fd7e43fdbcbba64ff3a9f83762b439da4c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2a5bea1783750b354da40f249ee1a73446d2a1aec65d0295895ba0ece9a50dd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "6fbceb7efe4904474e12257d671b5e4823e5a32278a7569d8d89c04eb110aae0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b207f39712183f7c82e2f45495c2e5a25a5ba9989e4513513853f3d718064cc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b0315f8b239d615b582051438f9037a27a55a283a67a445f0863ba838ca7ee03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6bf157c8e0b8c785ae52b61f840c276285da910ea71e8816347b0c2c97468773"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "9ad4f354486b42eb8db9bfb84fe58c3016edc337fcd11b413eac23211f61baf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b042314fe52bb1636fbf256c17ebfdc06eb47b53df2ddc1b30c915d7888f00bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1b9ebce31f1d59f8205eea5b086d0135ea70188478290cb59dd377195e523f20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"85c6c4653149e214702b8118daedbe72f2f575abccb6942281a6705ae0541485"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f353f2c1b8e25c2ff52c38843aaeff4e733a067728f3407c8ec17e76858347"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c15139d4a1cbc8934053dbfb0e940f67d77784780563812a51af85bab3973125"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0924650423561acf4606b264cc5afe95ec4ab5787c55482d7f52fdaaea8f091d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"66a66d41aa2faf8cab56d3bdff104f2e3e577d5a139b40a67a966085ec5851e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f4a825c23038a3cccd65658e3e8237646e3b4cee20003657cc98f975620074c8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "aac573a70398870c52400978d8f9752448bb76daa3eefeb308b2780a6ab3b79b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "467cc670546b9ffb4fd72c9dcb9ec47d309ae45c36b963bd118aca319b132040"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0f040a61f6a8431e543505bb26ebd5118824942fb1aeec2a3b4084e97b4033c7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "590242ed84abea4a72e39cb766d19b6e9a7a591cd16c5b63051dd7d0cf869760"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "23d200ba633cc282eb880bba15f4adc6337d6f14db3d78aba7912fa3555edf08"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ee1f6364fcb88f7afdd0f54b928db0c40f09da064aa2172c52d1e9128dabcc4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 
1, 0, 0, false, false, false, false, false, "d99f6e607668c372bf88d73ca331f53d6474208f33445fd05e08ef93733aeaea"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ca746cd641c4505a964dd7e577db63861233d83923c15631087be9f005d53bd2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "740892943888b812359d8565184ad79817361b6c8235939efade9732f0e989c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bd0147a41419ef8b97c3a1bb8376ac95af0af2f527ff685005b7980cb5e17d94"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b9528a0580b1d286aa32a463d4eb6db496a65aa85bee311d23785c03b8f8b7f3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "82c3f927d1bccbbcabe0d058f0e3d5d8f157bf38d7ab3f80514a26f88bbd3c00"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "367ad2a08c41a21f5dc9ba7578e2aceab9ebf58f69034e40473fac8ab3b6a647"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6256cc9041ae8fff88e876bd57fb8c6492b04a16b4b2d774183cd789721a7bfb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "71380233e738de2b8dbef1ed978e5dbe20d06c39966397aeeb8dcfcc48accf10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6b851bd8f4826e458d7a9faa52f122721340a35feb3696c85f202e97971ce05a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "255981fdc6db62854a539d2fe6db24a9b7800bf91199314eb7710b209f43e5c7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "556f00a3aa41aaad8624a3e72f8a3e9906b971eb097d5ab01914763040a8c27d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05fcee4b9cc1ab02bf3835fb630068e8fbf1c422c64c7c67629ae3a80cf7a1e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a92df6ee417a893872240535368db021cae626d41be0103718a33fecb52aaf48"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "53348ffe0f0c5f8c645aedd952c2071e93883378ca57f1fbc5786d4b5f3351d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b00ea14835937bf34fe30c01e7eb2abcbc9366c79cba6859955fe298287f8fcf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "e783c3abab7944891e2161be47f3430a0a94ab7db09f5966cb1a96b74797c769"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "50594f59ec2e4c7c00fcac0bf128559ab95d1380875cabc26a924bb78509802c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "08393f906beb50e1abf8dbae06b36113f35a46afe203a19825e95c93fd1a6283"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9027df64c85bcef0acc2e75342724b1697ffc05c484c258b391b86030e85b556"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9aabdcfe93867aebccfd3e01788357161009e953b1ae5658e76858b1a4653b05"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cafe95cd5680970faab01fd2f1be2578ce813f18f2201b1851a69cb72f5b7376"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "022475ec2f438667c63a8dad02d234081e9ac33ae30e14622d0f9242464ac720"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"496dfba2d4b9c8a66b668200d4b8189eb7a3d02684adcbfab3411b5d9af45f69"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d325d98c22a31e0d4552eeb7fc1918f05b94a875a0a43050866fec776c89b997"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "98fab2c21d54a75cc38492bf0eaa25064d7b36ddb455c847b4bf161881903c40"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d1a1ef7b89ccfffc362f7d2cf2714f93b5df962fab37dc90a3f3726723c7acf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"ee68e4f92933c7a3f0b431eb7637f4075786dfb77f50f9f97e73afd7ad1d7a11"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7b5d9180d0f03783e0f6e06fa051a510684b6423be33579d27a09144ee47b0d4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2c61111cbf0aa34197ff002b59e9c7b4532ab4e26a5a95fc92ee725c53285564"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "15e600f527f6b73d1c85fb7f45360f3fb5c7938f16911a0d50dbb7d64d5b446b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "34f2cf5a672d74e758c81dca117e71a1fc3fc7cfe09e75b78bd9b459a7294f6f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "aefb92198b6a754f083eda9dfd9c4e22ebcf1c8fe2601a2063c73ed1f20f732e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "97551e7ac60628491cc5c722f3acc155da2828fcb10c75baffec31faf717ff4d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d23995e9fdad42ff90c1ec3bfd6eca6980d580bbcd23ce3ccbd24a076070d22b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"90d0320e6265ff6e74c5cd77371c35a8305839b1f347b44583457db6b3b45709"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "53b672f76e70c5d2dc04de59ea0bff6e6e1a413f496d472dfb5cf6813800a19f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1e4e4d2e7cb2cbf3e000fb18999b993ae909b4bad4e68ceb7d39b71b53993293"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9d78be211193a14a38c03a48b688c4a09fa2321880e861fd100b5889bd5e1140"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, 
"e7fd524f7319e85fa49122c802351cbc1a945e1d33f6e95d7abc087b0aacb992"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "48950fedfb3b7209d23f3570ba471ba8b8fe483ccff31911f1fde6a94a092024"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7bd5d3c4e44c907270e51cf981fa7985bddf21149ed8fb6fe3cc67557fb556b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0d40c5aae4dd5e34d95db8abfeed56213b25c92a3d4329e64a79c8c952b83779"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "457fd72674677ec804c45e58a17b23af37f41ea0ff12c9b84318d780289711dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a5e289f2822864a6eaf7665100e352f55a4e0774639411e1ab2a83dbbc00d4f7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "9a08209488f5387fdf6b3207fb74ac76fd3aa98c21308f7ce34a4969fa4cefd9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "14de1e74606db11be54dcbc7a4a8c0b1af700bfdc94987d25ff79fb7b639109c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "630c13724eb6557597ebfc2c2df8bc7cc0e754fb3f624220e25f297144d0fe26"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "71ecc9c0a8cc63725aa35971fddfa4f7b019de673115b0398a88c514ab508bf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "eaa4ae198363953d69a21794a98eb65ccac700bdd27a77ae187789bd37c11365"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 
64, 3, 3, 128, 0, 0, true, false, false, false, false, "47d06fb3da44c55356d36d7ffaf5274eab25d902051bbe4e6858198cd408fd51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "756842e32310b3c2265d43f2d8dd8b65385daad8d20e76b2a9bc77400250debc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2be7ac300c1d79f00b8bc5dff55831c674d599f82e7ce822f6fe7f8e8e36612f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ddbfc1ab95216bd40ee85748c147bb2f59e898e8895a0119289feb01cf90e068"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d3699a3f3bf609ba91575c5af9b96f46b0f35d026d5eedd5ad29f5c0b4f14f4a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "21a36164502f71ceacaad01a7fed899cac653eaf55181497880f7717dbd7a607"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f2558adc88c76698b16934bdb91ff87486c13fceba0cfcba4c4ff7380d6cc2dd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"e5d6f0663bfb6568600995567627c6ae753e32556d7d2e148e334b0635e2acee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b18864805ba0e8eca0a196d651f3c176b02161bef58c1f4854720e221bb6cf73"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1b249aebee583ade3e98a276645e2d2d8f356f9e9eef5edcf3c43e99fd17e140"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "24f5ae34d1de8d0b1858e9b202b0c227034cd62d1fa026df637d596a8e49214d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "361325f45f40f1470a788cf1dce4a014dbaad39ee5407f7f3fd1af9bcd9e298a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "30835e85d3a8a2b438c90414905ccfbb78cf2cbc95b942e683e4d3b631f085e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8c46c404a73582ec1d89704f1ad942b3da0fa090d136074c756273a8d8dc75be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "64e870b36b5b0f3ba4c8745cc05a1eccacc54ef70ae6253396572473f168716b"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c62c25d21c1adddab217fc0a0d9fbd62310211fd161d7827061bcb12c0769488"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4ce7c905f0d5267fbd256dd5b8a6355d3ca76f6a7b60f0dd94accd56cf8b44c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "71267f1c2bf229ed3ec3f0c95ae02bb8220477032daf77564ee17969505484d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "34d28cf86de04141200d4a1f246168b9be2178bfa05c781273960c9d6a0f363e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0edafb25c7f9a3f62b24de39d10c6eae3dd8e6d837bc887fe3b9f6c330291f30"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c0ed90493b7a2b273b0e87ed4cfc19046acdc24533babe0211b7c054c3b289de"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"be09b1c501057fec80eedf51b3d87b2ff6531a14fff34e0f3d723c49128546ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "247d0cb4119942684afda0e4203e2a1fb0c9be6625d775f586799b62ee742aa1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1da50cb1f8e3b16609be48df759e045c0100c3f3da4990c7f0b9b1643cd4c3bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "59623b0c47a2e8962fe2bb94fa611ae50d02ad34572154596f25a13818d5cbc0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5a79e38c2e585d3ba9e9cf900f9e58d52a552d44290fac75de6b87a7f70d5da4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "b731302c24a3666dba67a9075720e289169735bbb6466f352199b93a4e306bee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3b609896609f056feb9d7327a7a469f7c583301bc4ed20dc46ddde101f145c75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "3adc2e52f878532cdfeeb000d703f61361361f8524e28aebec2aa17a7e8dfa01"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "4b4bc67c6d463e5b368f253f21286f3d45305b702b5c29f5d1f374e3fe4ad991"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fb8855f2d36407e75195b47c6a973becea6f3ec95b2fb4191cae41915f95946d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 
512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b74a6335dc835528d8c8c2832c41e0fa0a631d2bc5a483226f465e82c59e403a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2b687af13ac85fbef6c26bef425a214fd0b554df82ff00c2c4154c1d9fb6c9ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "cbe37f7e147438539f1e978f4d978b7ff157af563a829b0198c7c979f7687bec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f5b7f3bbdf972cfe74c95bc64cc98adb6dbd23d0a0ea9daab98eaa3df3c84f89"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "50fbd7f64d6745686a273f576424ffc9a3efa8b5b9bed1951bc0f53e5dfdad3f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f93ee8e710cfb582865690867c2691c96572834c60ba55e24e42ad9e98b365a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65a13b8c08996eac65a91aff0bcd0ac4ec27eeee713c9ad3551d6e023112c361"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, 
false, "2c86eda2150b248c86c20321218d4d30cb2460f57ff6a51752418d065b12de43"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0a6a3414c212eac65cdc77fbb424b9850cc5d274971bb1ba2cca45a2a6f28656"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3c33b2d4b2d38321a7492a2c690fba5fa73e3df6ba9c06cdf3ff4e0f2bd1d951"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1876e9d28235b266ee58627462b88040efab4fb6dedcf74b7f57df23b550a498"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "147cb598211a5d4fb90d7a1b88f5415989e09432be2bbfba220183d5e88540d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "80d37bc939a1160044e1a583a9131566c2ae2389d63cf49f2e9d6809804d3c89"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c865786fea6141651a650fd3e815d1200ef1b2a800ec06686461f1dbee40cb82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"2a5e59f9947d5e73c24bacc04f8f02286de51954fd4f4c051da2711694ad1c56"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0071a698d3b8fb379821efe7c672607002e47d5b2d33e4a39d286b61b8ebf1bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e46c7bfd0a8aae5900869125c85898fed6cc8e66724482d14d2186ba8ed01208"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "046481c7a549a5476b27c26c80491990afdbba7cddaf24ab213b51d2ec6b06a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "9b45d1bf5cafccb729ac8a0e9ccc0442f4b998c7c66f03694f1f554cb70d13f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "36f3986de1a0f05c6947ef03c077bbb84d6b76d7d3bd6711e4c2b1b00cd4e1c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "e3278691104c152379acc04915cc2ca343de07b8b89646784fa37f878c8bb23f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, 
false, false, false, "dbf975f77e591d49804b549ab73ef71faab1fa0b6764588b10a6fdf33aeaad07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e5d55518ea22ae8f99fd25a8bf914d219af44d3377018a650b4e25bb440416b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5dff76a248850f21926fbbc4743ea97b117cc28c024a11eb597dfd89deacf09f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b35e2c4e28b17bfc776f3e9b1acecc78ac2dd292c0cdc62465bce3af2fe8827e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cee1a9f7450d0b1223914549a33fcea0051f696b8a67246389462419e3b0f891"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "ef5fed8c784f9e0b1c7e33cf1deedafc3fa2d2802063d817f4928a8c0665d0c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "f8b017983fcc8b8b083642efce6ccf303f9fa9aa7a456a3fc9c9d115c9e149b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "8e3bda5a1513219fd5803653c8fc983ee2972fbca557f2dff02f436959b531dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "bbbb711ba069dfd140fff4b8c04473a581e2d1e4f17694cb76d881ef7257187c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "939aa017d0c091cc410c0f88b3d2b838b73d930abb4128a77f356bef2f52cdb1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "03ca199656a7070d37475fdcb00a934e55352822a725f327ef9b7fd10aa66ae8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bb0780e1bc71ef95f4acca0b0aff0fe121ae4bdcd29e4620734d08830fd6ee21"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "21ede5e61705fb15a366dde9d277dc746ef61564117464f6cfd44d4839e12f13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "807203c79853e0abbf51d102dd63401130dade11343cd7c664032e3e3b17fed5"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "130f5ff9fa30195fa9021b573a2e5da8907e7f7c5a94a45c75e35e764888e5da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "26f2c18fd3dfeed595dd9440690913cb33c85f0c2da1d69f6983c64bcda00698"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b0c53dae2bb8e76e9096f8d397926c02aa30c9baf1fa4c5cf2f33c735d2b43ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "8e95a5dd8c2ab0721163ef513cd96ee665dcb468d353ba2ccd4950c70296d1ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "6b8cbb5559b1bca1a3041ea70dd569cec4553efe04697a92fcdbd869280e8760"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "794d7a98027f6cdc5806c764ef66ee64442a6e9c323d65b6b2b07e97c9efba1f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "80aba4f3aec5ec7c63c98d191ab97484f5be70a75ea2c2aa3e9753bff76b310a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c1bc85572ca504d480d30b9209817e16a8fca97a276e1ac33b79c2c3bdc23af8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2bdd41ad578b530e1def5e00b5e414e7615bc8afb76afee38b8ff3f1757b6534"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d313c075bfdf92c6d6bb61157a27c3290b7eb0299fc3ed10237d551ed841bcfe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 0, 2, 8, 0, 1, true, false, false, false, false, "4202e7d752a0a776eecb416e622350ef7eab767d0c4e2cac8cd25c515300775a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0b9f53b680c7c08c0c74e5c04522f612d3a74a316f4b83835511c16346896156"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c059149a81deb0ad820e6ed77f98dea5d87860dc3365609d2d6675fe45d4fa2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "28e3e20a1ca8cf2865d92c8c3ae7d3c460d0d3819bc04596785c0c78578fe37d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4e00049e587c58e146b9048b92e98595b2c0e4a07b0f0f21a609b5d393329362"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "01b0c06fe4ae37cf2928d5abf81f6bac5a467a70340bfa5ebe924a040b9d53f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b3471b43a9b0a822b64cda739671156880ae39f3225c0fc886a0ad43712b4929"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b0cf317efb1ce89239d8db078b15b68587821c0529d2a235571c4299578e010"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "fe57e17ef4a9db3918baa72a9df2b0113c92214742c5458d9cbe534b9bafdcea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1fe5f4d800b23a9c4fea07fc8017e0acc3d37bd45035f48a2fcddefa206f66c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b17b5904947449006c2455b675a3da85cf294be1d03006342ce2f0ea8d99199a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"79f5206bc620abc894bd53c2bb732aa90f6ef7df57edc0f4b5f7687baf7835a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c77e9870d85d8d4f8709246e66f36f6b18bd962d976c6feef5caec1f1ae6740b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "db9a2c00122cd18c8db12f1f35f4f087421757efa695197caf782b06f9ed7c52"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4f27ea4235d352e3ca4d6abae57b89072356b362a5aff5ad24804898ae45ec0b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "21fce85bf18719f3212d3d1c3cfc815b78aff385c85b36d90db6c6bca2b877c9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "aa39cf0ce7d69baee094715b155d983f1cee5f2227f6420efad249a66950b732"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4bc2d3e8949595624d7f6f484160c5121433ea2e0f20684691e6b8d596b600af"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "eee3f680b598e4856ab2dd273636543f514bffd28116431890f5244530e0b680"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5d77929f3659d6fa7ee9f65346fac6340b3b44dd0431800ff305ceab1ce77b78"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "16f705e299d162d0ff12dd21180e173d50185bdadc9c1b18f4e8311b084f5ed9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 
2, 16, 0, 3, true, false, false, false, false, "2b62e5a8261542eeb06af1475a914e670c5dbd62b481e3fb71283a476702c327"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ede77ae1bf1ee5a1df65fe6bbdd53d37e730716e102554b4ca902f0616ce6fce"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1b2baa6ac764b32a0a213754fa502d7c609eed94cdd79bf6f3d7a5c9fb67ded3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8e43a1668a79a023bcdc24c79a17a5668cd0851c58c63521a1907ea6b4b08e27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1171d9c7e20c1f9bfe268b9c697a065e4e0dfb131d38ccc0bbe77f3e7e8a489e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "1338ed9ddc2ae62c9b8e5af582fc635e8aac6cddd9ebe2e564ca6ef0db8a7aef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "818be6e569751426dbc2e2143d8a73a096c0bc4302a12720f03ed0ba005d529c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1316928dff4f0027f8db5387873b2a6d7be54c1a40be9884494b1b6d7d116bbf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8bebdbe303ad6e43f70bba7c66bd13fcb2bab60124fd12259a27bd04837b6808"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a75eb4a18dd69bd63f70e7b067d11650a9900d57fbbd8d78d3f5fddb68d762d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, 
true, false, false, false, false, "dea350a87a2a8f247692d250d934532a1d09dd1750620a34336738df89f1815b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e21f898b5654a6710ce6fb693a5bde47255f625ffc4ba385b2f2ae47ef5a0ffd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f94a8f3ac31a3b8315bebebd9a01daa04efa58dd8fb999a0135f9b08690d0658"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c820ab53e328d9a1bead97ea0d62a83d83df22bf34a3152251ad543ea826ae5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "753b73114972e4b96166cd46e2410489baf6b1fe1bc796f192154298f92d2646"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f4f1ff9f51cd2b6559c50356e212a1b002d54f8e3fafaf5ca358a2b7ec1fb584"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7ae7d0ce89b57a04add9ab5e5a25ea2e204916d8deac25e6bf69db717274d380"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "127bf0a765369b103d3fbda374c94500777984204c19d3237adc5e7024179391"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "382d98836bf06b9d8749fcdd95814f241c9ea600e95ba17c403fc42a735c69e1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1edf7a596808f28a8f0135e7bd0631c297c4453549eda2546d02a6f2ee70466"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "19e1e67e6b75c7fe81e181478b3bacc72775ae3a7a6aade3cafccb2e549f9e4c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3024d274f1d34997cdec83caeb3a67a9d7b7b409c1056f5a4fdb6888c0b353b4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "277431b07e264b08c91cc49c03a559309a30d448059da14e0cfeaf06a470b4e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 
0, 3, 64, 0, 0, true, false, false, false, false, "d2c7326be4e9c14707d25f294d0b761fc5ec63acbaadff53459bad8b6d4c63e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e9f596f45360085bb7b5daa8263fd00f1eea30b573b9198ef67ff01733757d10"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2b6500b62cbeee9ed899efd411c676786fbdac9e1ec2d097ae4444e4bbbaa455"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9645ee469dadbc6266ce66afb3ed50a098f691fb6ff766c0f39f49422b59597f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "05b50f88e583146eaaeccfce79bc03b7bebc704311946c5eb5ba7737cec655e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "759cfc74d3ecdc97bca6e27999c4c9df424322c6673cbaacd01eeb45f2f58759"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "60bf85feb725e52ac1325eb35071d48a1a5c86a5656070716c2cd9511c4ed869"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "b9a75b7db734128a4260bb2cd017b787a08ce3e339d6a9612401549231842586"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3e79b1e7f0e7dddad01116c5dc0009ae2d4f5aa87b5c9e8c5a1015b5b3c9bb51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f52d367b59b9bfd0723341f01bfb540c03c7b4dc5d494b72cbb1d0c969f66462"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "6864110c9e43761861b7c4f28eab883b45e7d6450fbd407c953eca0a92fa6793"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "683e09776e1bf6c820c7db004dc833ddf1d3835247ce11ad1920b82a4978b66d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88e8fd790e18f77755ce51f1ce59d8d7f63a9f07814bae85fbefd4bd34e95546"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, 
false, false, false, false, "1efbe2d7635606b844eaab08ba19e61190fa3271c0ed77e2d2787841cdc52dbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "dbc8e45d8e7d9974f6696f1afb77a043af4cd134e0ad5682ec89e6bb47abaa33"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e49aaecf723700c1b15d9d679fe9bceac927805691635922f4ea407b209b6ad9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "47af3254e4ddf6db7f42694c8bbbffed6998659e87527460f9646c714d4bd9ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a0fd296f7ea7e743d8b090371dbbe9cf28a00614bd6d0d1607b4ecb80cd3148a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a5bf0563631360660e370f06dfcab8b7b33479863ff76e8350af3feca99f3287"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "697f926069c27adc669b429d0024a0e779a81de4f79aefc7b50a780a9dd2d144"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "f0cdb4c71b6a1c0d9b84fdd419d7df3f51ec997f1d7428237d4a43a6c1847826"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c81405bda5832155d838db2200b1dca3125048f85969e849d9711caeee3a1c62"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c43b7523b32d781ccc40c7f43496a11dcdc200ca506017635ffa95ce440b4e0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "175c2bc4c52519a19f97a04fd549a15d72dd498e1974f2bc0b6a733019331b2e"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "cfc315b2dce06e4530fe33cb03e4243cd4721118139a247d0000a58e8e9bebf9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "0660c14b7eb4839b113628e6156477845c783b87b15ce5d27e4ff878a4f2ebf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "d1eb6faa2a4558d582cf980c62b424164c85cb9408fc4a6efaf95fb913016714"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "a05a44375fe8a29fc5d9dbccb7677ba9fbe0efe5531351d10e049ab5c7ebc2fe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "afa9367c3268b444c4fd2ef89d18202027d8c6a26d76e7cc54cbd5e6cff035f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e8a801376716fcf6aea3383403397a26e40006201d1f24bfb958ebcb89662b15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "2d178922a19dc479a4bb91aac064aa7b565fb9efc93f038a35491ac7a29d0da2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "094dac4d344f93dd59572d147199e12cd7cb72889cc85eaf6563d8db3ba60f3b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "4ba1e3b60cbeb39611c0691910db32a65018e2bf13060dcbeff223e955b195b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 
211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "46a0a5da461607999bd030a4cc6a019b8fe5e59f8edd21c7a066c1d556cb21b6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5969c0b9e6aedaa2f5adc9883b4fd58e543f315b48ebbc8f379d41b86b1b8252"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "6bde444a74abe45a64668e16d9fdaad5212098a78b31ee70719c1382a10f5c2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, 
"6720be0df89e1fbcd637593a1cffbc8eecc935e6b958719d9b5309f3dc0420ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "692ae2c0532556c70705af9583ec3a88d11415de8606580e8751d261c56d0963"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "77a609e53aed84cc1c01fb1037160e2261f4b1f4b251b24c2a1139778d9ce9ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "8e61e9763737db6abb88eb1ddc9aff5393637d0cf2b184af65d8154545c7bd74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "04434f7f849f293e609d15f8efe56e9daf2811661ddc7b2234fb3b69d44bb6a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6279bd2cca629e25eb975082b9317c4f48b38238ea57a73186bddb709ca8452"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a02655dab8bfbdca6565bf0d84b793bafd23cc2582f764c3fe57d4e95a685479"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "085631508091f8b5d0527af168a369bd23715ed514e9953f2cdb72363786bd2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cf335ebc091ea120891c1f3e08d6ecb003c87356e608eada590678ac561df3f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0e18203e03c38b429b9189cf2c6076469b40c74d2675f0ab7cedbbb78c905313"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a0ae3256b54ce9b7c60136d0beaa1ee0ccef3905cb388db2ed9e28f7a17cd22d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8374df8cb128bc5d010e0e43a70af22492dc329b10d62b9c1c137e3f0821236f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f7cf73f659983af72dbea61eefbd040ef153bf9bb0679a3d1ac33518e0ce2917"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 
384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "70bb2f57cd2efd3338fbbdb17610f1a7dee066dca38571bf604564962c066f17"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "44002984b3487826e2ba35aa91ae025e3c3299facc047985dacf4fa1f0ccc919"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "4c067c4e9ad758dc51a10b90c7c210e0d06a31f8b516a56dede9034d1e354ffe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b95054343963c703ed6055f16d9f3c39c015fb68f9f5ca57fb55d2c5f2eabb67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "89b11cfff7cb6cf109caf757925b0d9d1488090d239c1b9ea610ae6a298bc0a3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "443db7c4772cef999bd0b34e97b9f9cb7dbd10f64b2c40b0b4af1e9ca7dc5e76"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3e0edfad24226ed4a1b876b4a49e14a996769e5e63804943c4048736b40a17d9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3a0f6a219ba812c0b33338e7abd247154e3c2e16657aabb6ea247414e54cbbc0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "35643782a477611634024766061f1c34b481886d12b872af502c4df11447fc6e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "27e723edeb2c5b3d2ca782944720346e42766b36a3856f6594d79ffb606c676c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 
3, 64, 0, 3, true, false, true, false, false, "9987096f28383af2b6058994ce2d5058d8bb1e11b2f7bc2ef4434e70ce9d35d8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "953419175ddc9e5aa8af6cc18e69519d229cf59c9cdcebd99cb6d7ec5d2742c9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fb48b5adbb371b7ef0790a3b15be815888a59f0f10a68db0bfae871967201f9e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "cb47cb4763daa7e309c1fbc3f5b58191408b61519b6e3d4010205b2af2df0910"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "0a659ad765284cbbb7e1026308446677e815959974c2eaf610d22c5838d8c139"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "d09e3a1fc09e2ff81f252bfe543ddd50c6993022d0793f80a6d5b7bd2da9c0ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "50d737d582663a4cd060396ada173ff1a09350f94b7ede5ebcd960b89b65128e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "427f3a74026e9f3efb9c99267bf3de0b9d740a9e7016732b4bb7a0607f5e9b87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8601bd3db445967017b396176b4764140b8fcbce6ac3a721e3ff869b96bc5971"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "eaa544b3b66665cd162d399d11af7e3c89b5043520c8e523457396437263c456"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c81069e7f9e8845c4e8fa34dc5a3a8f4db8a25d05804d44ae71023a9aeb0394f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "beca22c309088755c664ee916182cbc0397492b679f2e39749c880e2ea2df844"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6b50bfa23a2d505c51a260adfcb46ffc8022aa13c6dcab115a7ed8e6be85c96d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 
2, 16, 0, 0, true, false, false, false, false, "24c9fa8d9ef2235b6f76acf8e2a6d5cd7d13c4e4156c69b5ddc1aef45eefe42f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "d9a4e9d8859139d4534c165cf2b54f10fb33b0040a03acfd02d9bd7ca964a3e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "e4401d53907090bf26cecd6dd43e4e086e15472eae12e413e54626f8b8543a72"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "324a6d1378151d904d5c536dec64aff45cc6e7364449456ed3acc4bc9957423d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "c00cd78883d17b8f6955ef4a085448205434a1cee0a57bee5848e64849c18d8c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "316dae9bd4e97be1db9262a55848b4a7be51d75acea138a57654bddab6ee50bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "15ad1d9099d2639e30058ef0547c5264d9740c72d457be3fe61c01ddbc466143"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4ca9e0e82f01db5904d4a4f42bb97e113b9f6f3e25fcc93670f4a235f05d8bc3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "941e50594eced172457b81b0558375f216ca8ed52e38d1d01a0277e8dd33f862"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "83dd62e7b139c646284e1f02822716ea668a652abfcf481520267270908038a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, 
"99a07de3a573c0d13fa1c988831e89920cce98568dd3f8a9dda1cd44f531e28f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "aeba8d8e70d7a5831af536e1ba6079403986e3604c75832491687fb246f4c91e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "be210cb25bdbd46ce99555d444a4db88303ec8c388897287d68fd14813d035b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "ce52894d104591b83dc04e7dae747a4e241fb8312c67a1b1b21c55fe460e2caf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 
256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ef2c185f82183d1656be21e4ca66708a8915967f64d16941d5a8918b92d56c13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e43fe55166faa22face8ecf770c824229cdf76019eb32a8fdfddf6d66eea87ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "bb51be28d839c8f6886609c115114342dbeb3b4022cd540332c459267aa7905e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "2738c1a8080fd9121c065a23921389020a7b416b392e6b85b7f37e8f22e9dd01"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "d37dd1bcc3c7b6bee7105c4ffb0bdfa1acfebdf9435f075bb439089d1e7d44f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "6beecb7b05df3647530e987a7fd28ba646face9a9e1481098fb2d628e384250a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 
2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "fc41aedca2b234797e537a0df1b8b5f787002299c2b23e6fd3fd28c5be4ab204"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a0e3490a9d0a6bd551f8fa6ef1c3ffcfd3c769b59bb91167ad13faf0bc3d3688"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c0f47210035aaf2f9f8d0ce3d3d4e6d5feb3d8ce6ea272dd03d7b04ae28b764b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "c21cf7e0a34bfa5099583eea10b6f8ab59e965d81e9d99c7940871acb97d27c4"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "23fc31e8db085ebf0431d001f1f4035a3ce983ca1b71ef262e71d71b26f8968b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a5d3b697e393f8c3d2077db6067946130a6abd8b538bdcdee65ce562b0f93242"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c5ae1676d4a69ca94943abfef90fdbe1d703c690501367eac4f96f10ff326bd1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d4801911f3b74eef1a279c2f1d7985fbe3d327a989844b403eecaa9076345a07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "431e6b546e4fe2b0735547ff7f3faf8c04777001cdf75c87a609f7c72f2048f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4ee79ea8129306f5c5429704cea0753c3f62bee967f3028d946294fdfbd52e68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f0fc31fdbcea8bf73b3b011597d3d5d1bd2dce8b0f30b2d4ca52d9cd64d0a5da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4b79660567c5e062670031e5750830a832984d59159cb6faf16c94e926fa5996"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "835bbddd7cbb73e69d1dc384fa1ad70b5961c6e93caf570cc34bd7a71462cad9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f24583e68b39666888cf5d49923dc7c850c867991171ed327ad04979eb3b2b6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "560ea49625ab161e9005ddf0075ca2cd863273c7a8dd48a79f0ae0629cae091b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d6d2a0ee240a229f3f0515ac66489ebf162d03c80bdd2035e8223e28e00910c3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b0d5a1bbacb2a601b3e5760a158897d41f066aca674d6a68a1a1d1bada1fa3dc"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ee74dfd3d91aecd13089ec0e15445d6d198fbe54e627189d90adb04a4180fa9d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bc3b77c5fd1b71897efb09b25f4a8a7ae96d12a7b1f9f9fdc2eafe7cff900694"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "40c36f214d8558ac8a6e4a67f87e8ce2dca7fc6f1323fcf9527dc68cc931a358"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "833e8d90357dcbccfaee14a719e76079cb472bede51d01a2ee6b31b29734231c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5bb472ceeaac3fb05dd8d40fab23a7da83984b8e77dc16c691f0a124bf4d90d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6416bb725afac4353f57214bc2c49edd76636dafb87638f18a3c378097cd7a02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0164d6aa8faa8247653cfdf5d0352580f68b3268c111387ccaea2b8e789896b8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd206d12a115ce9666635d9b0193b05a4b4183a641378b2b37ca1f08b40fb872"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7874b9e358e9df71bc046aac4c514bcbdeb80707dff0d51a779a6508b9c187f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 
64, 0, 3, 64, 0, 2, true, false, false, false, false, "b0a7a2e61d2f1a7e75a679b96a230fbb6d736bbbb450037601e477da80b5f5d5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a8ec2195ba70f8c4739fe776920dd0289e748b6571f7b972dcaf801e1e5e0ae0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3194491b6b9804c70e2f6af262391325c646b7bc92c48b46e321baf704e05fae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8578d675f92487b9db8de6248f5b4c69106e10d5f52864bb260de44e942c9caf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 
64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "716d5e10145a4044ff8060c5d1fa550d132326314bb741ca7df3c428ace72ddc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "58fd6b496afb9954fef093faf991e193385dc54d16e93dcb3bd95547c35677c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4dad32e85fbe4fe484d8b8a7e43cf93b26f189170b50165f31a60f0c280eee0f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d281b2f87d360aae0264cc28ac2c783d8c4860af96f0a18d32cdf83be9e5a900"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "59a3bbbfab9493494a7e27dcacbf5301b6992b0e52579ad8a1222f3ff6060955"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "95c2999829134b5924d75da6fb136b7c3adeefdb74e168f177f78e758ec86102"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 
0, 3, 64, 0, 0, true, false, false, false, false, "76e78c6a16aa11eaf1c23d4e0e3d65d8b97ed8a80bd37bdf895fafcc9267c864"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "422e5ba78c04a678a56a6bb25dafb1c6c989948d01a3c20a5ed3ad8e6a646dbb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3273c932519be7b44f1b951f90b1e8ffea147b3bc90ffe6f8efb5f95ef5f5d7b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5b3f56077ad3f52f503d795f716a062e9f3ca0fb7fc5cac1fdf871be7b4803db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bfb2cf43114fbf6ed166d5c4fbc945c34fb42aa1c2fc6aef6aa58ff402287ccb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "92ba3838c0e80d405dcfbbf8cc076a53a5b0245e66db591a872041f90b3486a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6da3b8ba30f8346857021a2de778c7c50fbb6fab34bc2924e7e951943891370b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "9eb3b9c1aa4c3e0b0f02a1df92f9717ffe81f50d325e7a07c8d87175c3ae684d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "3fe70b95ca042f4f1ef60682dd21d07838e0b3610d369a70c344934eda8c41e5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "71b19fe60b9dfbf3dfecee6aa3f3cea6a54baba809cc891395b8b79afa59b03a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "e8c0bdc87e9059464093a0406b84347fe096c807a12edd2a99b5bef98e10da05"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "016480abe9738869a550c735102a7ab1a30c309e2fb779650aaddaaefd74a52d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "5dcfdd1ff324b3222793a9f7a9208f19c5f7bc9707fc625536b76b17629f129d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "c9a67f62184382a97288f8c859b560cc23153320352146d185a72efea4f49de1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "c616ba60658581eb371472cc5186e442f48142d071248198dfa632fd62a5eda6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "1511e8e9bb6d33c3d1579f3781e9fcd79daa6531c6fb9e1fd9a136311b6a4a56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7df38f14dca49f547f01c238344ccca8a98f24c8279df0201a47e44e8192573c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "854acdcae4b3c0d651f2af71c84eae63be1a44af9bc92fff0e67fc62b17936f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fc7a8c3171ba1b5c31de122f31738b8f4a50cb8aae63364a838154486c34137e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "dc49e5ac658cf0642e044722309397981f31627ad22368e307848a12fcf60034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a0accf802c14cc8b17e88670b64e54a3142c768e6b0a6bdf15eb7dd831cebe03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "bf77e551e83a4e2941765d645a1d3913a53732e2cc0e1e881fdb4d8e702e114b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a6ec500f0a2ccbacd2fef1c396fbc54ba79059a92936efbaf33f9abb4ee0958"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "007932793b1d5b6adf62dd896e52ae320f2b3e0b74c3a1aed67e1265d36f00df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "56b565ce6d1521cb87b7edc587489064a4a03b6f62d38543cdb6600666dbdb32"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "dc26fb3f030ce3552eb2af74d15866cc2893699d4344c1ccd143ac066b0620e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8087818a17c47131ffbea2123ae5a8fc14053aa34cf788ab3c6f24d70d85fef0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "78f537bcf7fb0d374ced486ceacceeb399bf2181f0df2b7a196e0aaff4f35501"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, 
"7f78e318061c8b783177e9d23a9cfb9a0ab84fc8751f6ccc4625510cc08ae9be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "fb6c64d85e946c3f0142bcc7a977f2ca8cba54ad72d63bc56d75f1577948465f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "455969b9b20c44bcdfc601aa32d4aa2bad6c65d6004470edd2bc6889568fcec3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7af88e1c5982a79e1477bb14e3ac55b858dfc7f1962355b3402a5b7632843632"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "64255add7c035e541b415f13cc072b23b14531fad967914fd602d68e03da460d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a3207443c61baaef52406cf85ed3c9896dea3e3212cd625511e8bed6d3d92f55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4349ace32a6e99f7638442c2cbc642aa740b7bc4599876e1cef74ba7c356ae8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6c456c541b9b8bc0231c509614123b447aef76eab60e2933ca63498225ae0751"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d25d3e0cbc6d9e9a323085b5d238b095bac5bb2f390a1b912a8ceb49de30c330"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e458b191d6e2496553334ebe43cdacde01dc95d8f1f13478c208901bcaa0834e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "c6b797c19ad92439cd1a4c8d3d844cd389b41303874fb5e96de5a6b46cb2e568"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b35a6d7c866a483b99a66baec5684b00941987cb49af49e1d613e909364988d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8538f5c03fce4117c09cc4bd323a1c9361e9ab21bfee7e844ae3c9447fada728"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f015198fce4731158127fa641cd72bf2bad7674d3d65e5d568b523c0058ee38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "c72e0383ce3d43a7b66e6577ad212e7f7c97f6b66b48478bf26739d2fbf0ea4d"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "812d660c498a50e75c70c0c9c8b1419488e48c6c3ba096fe38555e64e8a521e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "96d0b942580095826eee683f6c9a184f8ae9e67e9257e4901f878e71162c2a13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "47ee2efcc7b18b7b5bb0216b930193a17154ef6c33d147c6fb64121f6b4bcc87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d22f8354830fdfd4d8390727efeafdfda47eedc9a2d5640c1011c21b03b420c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "9e392161f711083904f2d9514a9cf1f087e68bfc2e529f8998f90134c14b7a5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "111912d97906e2dce4011f070cb2b39601a5286ca0e6026235c6fd3f40215036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ab6a1f2ff9b3f432b39c626485898e8c27a1ee771ad056a300083e8031e887d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "18bb5068821c9544a80929955cd0102868fe289141db6bdbfa863a47170f3f62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "45d867f7b6e871591ce12d8cd431aeca024a519a5115213c3659a79331da508d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a6ceaf8f2a99127cd846458c4043ccf000c2cde7d45f60561a59c1634b447b4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "efc3b5b4024c58face414ebcfd0c3343a9f13de08f2e0b483802cd397570d45d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "61b4bd11da89b1cff460cbcf3ffdf125da09738e1d2b8635dbba02a981a80b1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "4ea1f5726ada8fe473cd9eb05f5f5df9fc277d28a5a24aad313ccb9fa2d52f92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 
512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "55e6175e1b2860225521062d765269c3baf7a1412c60ea4ec8c1d7fc3e505e98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b78a3717dea99d90fedc6a8abc77d0ceba99d769b338260f26c1807195e6abee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2b44d6f3e363ca9d45fae830422b3969e08aaa9b43c97b8c97b1b937b61dfa81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3d9a5a48c67e886580fd58b49322696e37196808e635ea1b4e134a8b4a202905"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4cd1831f36f4f8d7cbfd3d0274f26f40fb837c161f06aa200f177aa2c43b96d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6932515c43a8790792d9bb5eed2766278e7aff34d43afada16188db87f8cc7a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3a1ad9753348966dfdf7c7d6290e90175728df1dc0e8c9b743ff015b5a55402e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "772169906011fdfcd7fac9fb7131d928f6432c1b0d424954f1e2b97c8de8ae83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3e53ba3cb4bb5604d1b98d4947a96f060d04a8fbedbaaacdbfde5eb482cb6826"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "7399e8a7e2de6eae182e447fdab8cc4302727c1b3f0269abca908704cb7662da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "31623824824e3ae5b20ed82f53b265c6aca6c46ddc13ef7d14b6ae20eb661c1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"ee18197560978c80149bf0fafae4c6b9b3a3d9b1672aca8b2cac09c49b647834"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a0221752df029fb863f506bea5a959673b39328cd0499dfffa1069474c97eb99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6009a1e61fca6de3406b3e97f6980df5406321237046866aaae65009d9729788"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a5bdaba3eeb9bb097a8974d8ee9ca2d6cdba815ac9733ecf5eccf192bda2bd0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "f2005b3159b4a341d4d7d8a955621e8d4fad4781f41af7279dd93259c89cd409"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "68510b547744e583cd84763d6cf2882365c404eec336aa14b7fb18feba2b0316"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe6f16cccf27935fd265df62e0031b33fd380e8a38f1fad136463b96bb50d01c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 
0, 0, true, false, false, false, false, "e30f3023602e9ca3650ac9a85927ba020727e15317c5d74488499ec6f02f739d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "da42cf751eb12d3204c9a6161987796c4fa3893cd97a6d1c365258c825775fb0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "7a07a4ce96cda87b26132b8169eee7ea4dcced32968780fefd660e8a34e65698"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d6e894bcaaabe54a7c610f5492237de1146378399dd338ccacdfcfc34d63107e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3c1e246b8932e7f9f0e728abf2b6781e5891ed342f47ee1f3dba96b1ebefda40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0ed3e278c988fcc1c9fc204fd36fb6cdf7953bf6da7929b5227b2f68b722d28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "701ed326b5b6ac81915ffd8ae4ecabae1bc0909a80d1ec952a67dbd4ada5a893"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "841b083a41b53b7e3cc1d61d3f0c7f6435be5afdaee6badc94a5de57478e6ae4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c7088e5d807d94e00617662b6a12a2bd3dcf64ce808afa53e846a6c1b26009cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "63408df45ac7b12d6e34f4f293f5122c4afbf951f51f65a3c8a9c42bd58607d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"2fd2d3d8768f537428ae2edc614b1612656b0b3b58200cd05d2cd19f1bfdcc5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "4fa2a200270964ac5066f9deed0c8361ddadc8fb6d7beb677b160dc0f2108356"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "98429fc9e856ef519cba48be026b823c4c55c0a92f2606cb26cca330ed88514d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "b1d788a0765f8f2ca5ff2b1c128498d1bfaced26ab793add44c45e1b2c7b357a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "4a5a271ac2b8cf1b8c3cf82eaa874a320fc263f814d8f808fc70867fa176bd24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "646a65e01dbd488ba3c63ebdf52c2ffd48d4626af2bf111bdaabd93207d4df81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "b24dc21ad571651b0ee8a830a2deab08faf11af80030df456b50f2fa9dcf1efd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "60405e8ab9d66c3d16cc4cf3719f483e5467a34fe9b4034c74bdf12b9461528d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c5ba3db03f0c1f723ea0258337c8ec615cd2099f0cda2a3aa54fa6c3d991e382"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "feb87a6e308701407c51ea464f4fe1f076d564c546812051a59db8788523aba7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "de6eb29be48cbf3a69eb0a49259fe668225922115daf1f69c5d6fe28eabedb89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"92ffbefd8fd6b0287aae78550957f0d688e1c6baa8153b90652b14f65f545217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4f7d2c6f723baf7a9b0f68e6caa2e7b32d0ec8e6831d1c02086a160a95e5d89f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0bbf50d17fb58bacf98d0d9977822f8603e781eeb1853202b87dc91522f15801"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b79e2bc01960faf161d9c8a8bdf748b4d1cd2d2631d1ee08fd7940da319902d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c56f2bfe48a95d06dce3b494f2f611e566bd353187fdaf54c24b5915150c7682"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ec4a7734ee234165ad0de1feef98732d9d09d05f04b4e725ebfd7dd312ad446"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "19c3be57e55dd359452c0967a28b276033bf1c319e39bb1420fde01ea73ce31a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e9972ef39342938429a996868fc2f3e17bf3e9fc07f63ecef12a77ba8d333de4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "37866d288361cb8e3704097dd05d9e1eb792de3ea223d4e1b1afcb41959721a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9d4c76d8fd6c425d611c2cb27e703ef8b5edf0c1d0582beee79879430827593f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c55973ac5d52eff1f8b559f7bfc5b8412d32f5bad696f9e3dd7dd732e4852de1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d3c3f88903c271c6600aab7800d42003b57d73f60b37c01b6484abbb7ff6a313"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "2dad2e5333939d5136e39703e6acba68be5e3ed02e54335901ec56effc78f257"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3651bf6f4dcad65f8be618f78218b6f74c2df66c27c8befcc5be7cfeef42d989"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "072bbdcd4b5c31b754d125d202edb8249044e91830c1498ff7383e8886bbcfe7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 
2, 8, 0, 3, true, false, false, false, false, "0f403f443af2431b5848f28c5d2be3808a8dc368ae6520a10e5dc0b44b07cdcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3182396f02cd2ffc4cceaa0a70c07f997a74d67b6b3328eac4fa78ef7b26de59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d54cf13223b10dc3a7edfa06668cdb0b8be76e56785dacfbf75a4d962122d9bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "ee462c57b76ad7f52b4235887802326a680e566fa73e114bd47d9d0090882e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fcd37f7c6236fc302cf2fd16b10144829f80d1fd8718a027d36e361afb5a3ff2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b99a1087e1b7c058d343bf235c8f6fe2b853e1dd41c2b2744eb41755aea2c119"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "b197b11f37a3311594c5dc1e5729d975be2854bca587435fe9148f83c8df4616"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "dddb3a0b8a608827d7c2472ae617a7d58b9965c0d5c0eb6ac2201ab0c1d0ee58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "20bd6cb0deec331c81212e9ab7474c075db10b72766b4a0399b4d57815409765"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bae2ce0bceebfcb3bbea5117f73709e9feef3dbdceb71c85629d5c523bb62e42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"7db9e58c38b448865c730e5740248895d71f55c84bad845b7e6678ba9cf1d8f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9039a7982dcc7a9d1035211902da0904b061ebdc2e8cee4877e6b351fa051837"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5f175b3701276dde1af21909d3b3c6166a20622728804f2de8de55b41e70cab8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f3cd36224d70cc26934b22a7e6e78461abacd9f0604a453a4e7ec697f192ad10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "05cbe1ae8d7c500cd2d87bbe54be7d9c747843d2900d2867d0f0405a9f62d891"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "dcb04b7e71196b22be2963c484fa141d580bc7a0ea52ab304f73ea3902f4c348"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "2f8d7c7a60c02a06281a46dff2e5d073f693df866759cec1e635c3cd719455ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "fdc7f2f4bf8d4ef946ac400266e340a99e8c101a98ce31880f377f674ac12752"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "55c94253060442ed04a0d3fdcfada3cd8cd808e44e0e780024bfc48559ee3cf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "14c180e5a022fade0a58958b275d732d02f093c54e9f97f912571280a9c8b01c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, 
false, false, "b2f81b5b42ef46715a122a82d84de6a4cf92c2aa7443ebfa3f4cab2ed54327c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "47ce44a8669e2a7ca9fc31a48ed87ae2f2699f8fd96314e681a32cfd25fe217f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a8312c1b7676bcfb5ef1d774c911ebcca51d64deb0937e476f83b4bd9f895531"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0b491f09f132dfc6ae53a8e81c7461865bf36b8c8ecf0fb2120091fa2c56ce96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "aad7fa5dcc449359d4dd079552f7e4d667d3d96ccdc8fb694f0a60cc7cd1feac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b583791ccff1f249c0c83ba32e92fb67dccc9fadb41463eab4d3b5e8c17b41f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef6e859d3f80c08513b173b204720132155593abd2a8ab81dddf02b3619a38c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fb20f53bf763202c1d485f1488c79943d9c69f2326067cc07d6ec9dc6220a4a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6e31806cad5e7f5604a648fd357e99c76fffc59813817f95fe5a710c7fd4e8ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "bae0aa5a1142320fb5fca5d8f452bddd7b635e2f87baba589bfa1f3367024a56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "12a4c29dfe7c01581a6e587ddf3ec3ce965a3cd850b89094adcba2ee155afb96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0b18282a1ca6fb739d676626fbed0f8bb55bf6cc2ebbfe93c94db132e9c8af8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6def0d50f19b2f1105be0081f05e2cb33d93c7244b097b30f4b06cbb61146e67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "03dcab2d211d2ee86007fd736ed51522dbeeef49755530f4ee1d27965b2326b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8a0b2b167104390c23539d847230c4779b40c7f65eec4408821bfafa0b8e6781"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "59a84caa53ff28c355a3bd5a4b18dfdf5b521b202a5e2de6829c1ad550a46a9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "936da7d57b46582d0244f74f67d493c8fb55afc73b6f4f71bbebd033df9e6d22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, 
"5f93628e45e30518847b5c3b3b90ecdbac9f89e5a627503c3ef4fd3084fca2e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "64728e41e867677310442222921b10eaa6d7ea69164c015ac2c7521a26cd6b35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "c54e3efd1d9bea58090a4399a8d49141ed6aedb0a5db1f4e2b908397a6a83369"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "b8eb35b65821d9e7d27167a013ee98cb19e41b92cee3bc460d6b26fac367b3ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "92a3daa56ee64e471411bb8e7d5ebcc95ef37610f2656f3fa79787e48753d904"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6008ff5e6ff90cbadeb6c6ad8a58cff42c365238bc57b0eb4f5e34678f4e739e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ec666a46dd0e85a7141d4bceef7cb7d49a2723f69964a5387b558c16a09d3b66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "efe1682bda26700781b470a943541d6b0a8dc816d79d283342f75a51049b437e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b0238c325551e8542046878e3849d7cc55a5e2928198a8e4182aba8a733e3214"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "db91d843cca016c7b3a6ce7848d23306cf74cbe84705b0efe5b4cfec0c1c21ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, 
false, false, false, false, "1fad8affe33e80c65713543eca1c1a0526820be24368c4256b85150b49cb2b73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "93c6f93e349bad2f62bdb1d8b7e269fd2966bbddb5c806380f62b8c1639f9caa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "5f9d48e5f6bdecdd997fca5c40e90fa61088fcaa3173a8e245ce33144de5a53e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "e15de83190e190c2766ae241e223d202f4320bb31824b8edffb8ba464626a8ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4be24517715ec7c5bf9c06fbde66fea20d023c5adc4ccea86df004f37c3edc05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44481d2273f7a6b7a5bd8161c91cfd1c47cdd077a22069f5a64d711f872a2544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "30f787700cee3cf764a6071c6c5a3799a8eec3077217bf2e1a6b090e8c497a4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ff11ae41247c4f9c8c9665987cbd9263b380f59527dbfda1b67d6b42524d4278"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "ab7e6cd8ad4d6e651f4a0d7556b4c45f09aaa56dff09d33556170ffd1161cc1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "59b4a1eb4d802abb700c74380d2544f7eb46eb6a36544575e96870f5b839df99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, 
"661c2fee03953f4c4d900c9b43309822434314d748379cf814783b3c60f2068e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "d5073f4dd622f572c9b56ad566b3f140e413a7e58c2a9fc096aca78d28356e43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "ece8266f96e977a4f20ddf79812632923d636e35be487d30505ed4f034b578dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "1eb4a120b40368b28d68fdf3a543ed7221712060676c0338827b0e68bc892641"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "7b4e2487c6010afffb8c44ab1afa4a5bb81c1df4256dbb85495189abdebe5134"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "9ed9768ed56d860580a465862dd3cafbe92ab6847a5b2e1f668f9bab0d8c9dac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "eb5e0438004001f2a3e4066792db4c9a4adfaf5909609ce805fb37c4d0021771"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "912766fb468111f7842eb946d77a01c01111265d82f18e27fdf4b855806c60b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "d4a76299007536dab31c9ab01ed19967b3db1fa0e9602d54c511caf9f0bc0678"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "bd94ff92cf0522dfa390314491dce1f1c01db57a5c10ff285846af0aad722f41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "b7a938d51c6b403552d97a5b08fdb9b7affd4cef853aa5bd6f24362d104cd956"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9d918982cb886de9b9cc4d3e9c4f4352dc55e4ccc4dd6833ac6696c03f0b32d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "6bc1070a81603091b5916a87439a973bbda1cc76d09b3df786a226ec4a031b8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "9b34d0584bfc981e893a41efe1aa5bd33bacdc154db5f731c5a81094ee46f994"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "63b66a4a9e8c9e1a6074eb8860ab13fd6fd4380cdbf9199f766fb9416fe9fe17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6bafee8bd7ee6cd9daf29b9d4ab5c4075a210a65ec15c44f2d251634f563128"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d9da6bf31c545ebcbdf0983b93a4100f8eebe2e59a114a3c40d92930e25ba750"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2617509ae870202ae456a1894923656aeb64694c2f9364de08d43d6e17b8c0e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4709737f0738e7c74836343cfe7e7b9287dc9778584f5198455e46144d6f8d47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a3365f584931226d4c5153d70da6755d25539d54358e702fe19c261c72362394"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, 
false, false, false, false, "3361232b68a9cda5736680e4e726b266c3889991a0c9739d7b599cfa17e7fb79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b3aeb985905e62e3ba766a5a7d9a477d73d93b998542f65741cd8b235ea9f4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "9b3d7a8492a2e7e014a4d130b5e43ee2db3b9c5efc7832273ccd78f287d39da9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "72dc4257b716c7b2442cc0ea517ea645556f75bfad5653117682075b35226b52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "aa2d56e5c251427c32edcab60e0cec579ce08f98bddc081b88e89dc75e492008"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "5b2d760bd2590e362fc3c37809afc2dd81b4f86c5478a4279cb9f8cf837da215"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0f129b3adeea972033bb604cc82b4402e5a224af5fbda3618a71b93ee992837"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7ec8328efd23f0ba6ae8a89e482c42dce09ff40fbeaeb3a9d6db7adf6dc07bd6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "592be1c539b3d221f82f474f44f00a4f37580774380c06a4c719c835e5198032"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b210fff426f228451b1d7389ded154239a406d63fcce98bf5eb5ed39662a4e11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7cba359499943a46967eb6c15c852a0d94c6a32b53d3d27245e8e5f2b5de464"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8b8c4053d16abcdc55a005659b5a0ba86f14165b8562da5eed72b2b9e3ac3f23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "bbf7f0c667d57f6e79697452f14a2157b08d0b62bd75c8043abaeb3da4bcf499"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0e676334c37660fbcc6b6c3271b98b95b41ad89fb8909f88b710ccf012a55749"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "58c57161c1e22e90b9a3240511de435f2230da5f0ab7aa44d9bd3c324c161b2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7494a0700ac954543cd4afc779d6193cde92ed0b866b62640200eb77733db81c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "9511cdcd851eee460bc17467b583d3116b81e2881da36a6defab97e2b72823fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "51588117a0b3628acbd487f5db2f5e44e925af1061992683221d0ad5a0ac884a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "fbdae4fb19954afa3ca9843b35159f7ee766a53281cd2bb611da75fb903e2f02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88a18a30adc4f899b1c0f4b5e399555e6875279cd42f2877042fd4273359614c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f4ab38ded10e1d9fb0f86a56c40d84cacc95bb9ac7db8f52f9e111ccb4346da4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a0e2c6fb0be32ff4649d698dfdbef740680cef69fb833151e2ef9ff552b2731b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "970156cb6048e24393bb669a5b40988a26a49e32339d865f77cae02c10fbc9b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "287b29a164fdb6474ab047967fc52426ac56f78836e60493fc456dd5fb591b0c"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "052dee6bb49f77dfdf3bc69d755831165c6dc0863071e29de916f451c7257567"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "730418aac876dd0ff12772987d59560129c680fff5f320b049f597cd67c41b1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "0f4edc5c8017355d2f2346598f7c2f15a77ae274c2bd81137c385e4d81ddfee6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "925d7198bdbdb012a064b2e81550969ca5ad2a2b09090ce5f2f025079517a22d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "b54fe8483f8a3e1e3b385568aa09459890f63965df8a71072a51f462217a3348"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf15b47000b32a5cfd77a1d975032d15c41bcbec3258736fbe5b6b5636f60cfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, 
false, false, false, "76158e329a8a3749f7a2593178f11137e3ef5c3495f0ef3d7b8ed852708be7a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1963f2f427a216c4ac93fee76ffe1961455f95b8c848534d8982c20e09a6ff48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3ddbc76957fabc50e9076fcd609c6bb85242f9a886cfaaa7d1fdb69e4f815a1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44e7c92b4fa816e04fe1bd98a17234e2a282c4dec3a02911d0e7aea2758f0fea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "7a430d90226b64b9fdb15d7e657ab791d877e6eb4ba2f555bfa85402bf7e1710"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "312c6692c09314f1447db45ff20b58ecc3855b8260f2b3706e94792bc30a59a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "01526b8d1f158495d9079189bba0d8fa76f5ae041647a6ea281bd8af3d24d724"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "3edef97ddef59e54572448d8a655a531740e6b9ebad357c266b16691a3e86f72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f33113f71eb1cb2c46ac4e5f6f13764e3c34f8526fd1a3274ff75d75d6a9e961"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "98aefaaabaec647784720e2076e4e72d2c0959410c6a92d0593aee7862229973"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, 
"fffd29c013fa455f0816dc371509021bf3dd296547db9208d680c942353fead2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "de12c878d1bfcebbda2943e4d36abc6ea81bd48613a932c4e6599fbe51009342"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "5e77cbdd92a6bb43da73141f8161bd3033affe6df7a0e95d31dc44229646028f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "03ffaac28071f6a603699b8ba53eb6c9ab59d1703865be80eac658b37dd73407"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "86e17cfbeda923c534ab78e9ef7dca994460d5a3da294d7e0ad78c3fd54694ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "c1452e0f66f4d506e4633083647d2d48f724306454a26063c1b7302cd1aebe09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "df96562b66d42dbb0e36b1bfcb3914a22a2787ffb88d2234593f89afbf14bbd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "20ea4938089ca09691eddee6b51ebed79cf29ba5091d76c009a85dbcf79a3482"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "6b285cf23e49b39ada04fdf91b82f15c10f77b253fbc22f8d53e30b576fc08ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "baa65b2802417455247287b51296d5351e9ec21a3b68e5edf37ac838bf5ba5dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"6001f4ca71894e42398a155aaecb681ecbe1471bf1a0038162b7c97eaf9d414f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "395fe57d51eeb05caf73bf656fb59d0a95673edbed9ed08a50f2388d7b282363"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "958ab4386e2baad928671ef1ceb44990aa8d7de736694dd1e437291b4c1cf10f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "245e4923ef49ec8f72923353ebeaf3819cfec14da27697bbeba5b0dcd2096f37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ce05e995bc24ad14e308c9670487a9dbb8f1b36e9ede310c3906559b69cf080"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c9bf063f368d449960f1bf4253296990f7efed663946c0d41b7fca232fcfd993"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ae6feaee5176cafd6d7a4b1ef23a4ab276f62070882977e8092f38d9419ece8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "86357637085f161c4a3bcc4f4ce5c8872bc6a2623cb88fa6857b2452fc27afd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f956880218fe0cb2f5c0e4dcee228e1ab098eb79e8516a398c24003afdf3ce47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "55a44e3c4ebcc4da151e795adf6e7e6fb508187915695358eda09491cc134149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d83e71324a541a5e449428628b08ee9ff3d9c345e535aa47ae4e5010a71ad692"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "6b6f9806a7a9bf29ee23319b1a56b73198bf5479cd51d6588556ccec19a79001"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "64e6972e7d0667e95347b3ec77831ffb8daa3125abf655e83dbc4fbe2307ddc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3d25e6e61be5d532ea624e5cbf338c31c5108e4caa0a353ed9be1cefbd818840"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "288b7fd257f39f4c4003b8e2b725c8e7bf81305c4deef0deaa760962f37a383a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "004380b373ed3b1a6dee2fc0357931d8b95f60a5405051bb4a458c20cf6d2e7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9adb20be2409a7887ff0c8ec674e25139cfe69339a2f41f9a8fad0d850e1881"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"7c84acdcaf8b33cc6d080306315e231a4b368f064c19ae927b458e769a7d59e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5c4dfb23555feaaf890d8a5ce486f9f930b5eb58c1792966ab425f81c1f3529f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd478480f6c34a66258f8a95db81af9d79f0ccdea447ca0902afbfcc26cc97b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bb6c6538d53872b4def891e073213569713e3dfb2da358cf7d1444b0e68f0943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "4aff95153f588ba61a100df4bef198ef45d1f1cfe22c7c4ab2088131023b7749"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "886a4659633988a3477763f2e5c14502bef391c3798b56323a8c631ea3de9767"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b45459ee91427734e02d5fd067d0bb254a4a56ebd3acfbabe2ea82de4d76a14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "2f09d79d869535c44e914a6781a9545d0ca7516aab54844f480099952eec131a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3042c84635f2d21848bd3e5fec4cf827b2b284a1dce52dac220f9d8be2d37161"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "94f046e819cfeef9db61278580a65884a3274929e74665b541a17e02e03df9b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 
16, 0, 0, true, false, false, false, false, "b77490ebf4c4ca77071bcd906ffa5c05391d54f5cadab910b9ac0f5f9c3cee44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d83db0dd9d384983119e00c41c5f2ae2df9901b85671df74bbc8dcf886e31452"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "83e28248adb0f786f92b8f67eb5f6fb1a911a92173980f3c1858daaaa7d98c77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "3c3e1edd6d20cce4707b6c87f7376b452e8c313195e9bcec032726bc5c235f30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "21f08cf860071fe53dbd398d38b074febd578405558bffc2a6f29b960ed2c449"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "adca635c7318ef4e3585e8b885ddc7e4e6f9aa962f8c33aa7d1b513bef952361"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a519dcf3cd845cb720129d66cfe672b94f9ada37e2bf3b28cf1ccbe64917c2e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c45e98dab8f7e8f76965b00dbd22192d1566d7a2e90c680ea9a711596404f79a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ac842d70c33d2d6c79c520bf9c643b2bc0268b414489e2b64c77e3bce466fe73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "868aaa29a22ceaf3d222cdf8351adb72f52e640ef633a692d0870f36a7b4d970"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, 
"a4562231e55348242029b354c81a7b0b858548dc83416ea37c6b0d496cb92d07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f390f9312f9c8bdc60e1d954543210a10340fe5eb65554b1b53f144f6b05d596"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "5a88a36454e9f29fa85b18daa562b96a87044b0216c8988a3fa343abcf6912bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e90f36ca2c667ad9d6bfce69e637aa6f107e565fd0e7ba23b8e2f73af3d55e1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "6ecca10d29cca5dc5274db435cd14630ad79a2156ab8b8bd2f70fa9300fae2fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "3728444ca79320d2fde8a22b45dbc3aec9130d89acf09745315a05f56950f2ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "6a04b67d2cd58fe338c0e3f6095ca87a05a6b86a0d7ead7c839e6147273e5938"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c2fbeb8a06534ef03a405f8249586c6ec4eacba55b3592d35123437a8001c4b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4416f70045d3bd3a1fa4316f21ab20f97627d25dc56eaf96557acc47341687ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9a1aa67c502d252a68ef69a60aa5cfea566761cc9554d3b2734c982e8c22df37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae6ebd884ef0bc81e0bce62f87a99ea6bc9705e707c0c5053a102f6472f900c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "38640276af4c9e4560be2983d162aa511c3463895cab33371c49c0d9f215fcff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4a23c72d69e0098418ed2d76f5d45542c76abadbe9295de152526aab6959c481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f1cc9ece73108be520e1a6e174b60cced74488e0adb6fb309bc3d4b53dac300f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4f9ff7da823ab0b7ab8af298ba399abf20c551f5b3653374888566aec1124c46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7e84f805e73c387dc02e8c58d0aba861e00b31505039ea2f75b6e841bd3146b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1bf98b254e090cf12818681a856befca3f4e0acc9901c4308cbd41e400d87108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2e5c3f55a042c8d6088487fb6345ff54bd8114c996491a14e22f1f6da729fde6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, 
"bd5826df2abb809909768ed21c4a9f0caf196226cde9f6265c056bc2154a0af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9a3131355375a1fd0a15ae3a794ae4b833ae08050b352c26c35a69db472e9f9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8fec20f3e34bab9254a3cfa3e6825ac63174ae95de8d759a7f549f6b85f8f66b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6276c87c20e74f52ee533cd3de4d155ed667fe8ddab1c3d462bc889a4181b24d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, 
false, false, "218c0cec407d486155763caeed6e01875acd39f56821c98c4d51b6e1a9e324b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6bdc265e871ddea1d5ab4b283750db9fb5106ce200096105ae546bf73653e5a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1259438aaf03b2ea4ef1e5c4e475fed5b5aa26982846de546483fc5de3da05bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "a2f788c3cc32b27fbebdd785539ff442736c2a2daef9de08972cbd0dd5c44809"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ec215e0bf7c1e67ac8363ed865f438ff163d75554d6f58adf78ae90708de8d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b67cae85eb29a6850276f815466620ddc27af33f418e07c5c1adb2adfcc333d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ce451b9fec6d6e02445b307baa7cc67d3736fa35b7c7ba7e8787455356061957"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c063377bdde178156a6cf9e5ee66d66b4e46b3b38462c69cc5149073311a6b2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "fa3903a739af30386600fe70526d57ea95b888d6ff75be715f63af1ff9f65d63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d2dafd5f13444c3e7fbfaa3a8ef4d3f0cd3d628d5127062ae12b66408c40a3ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "15080abd48811bf59173ff99b05b3063ff98e149efdf86d7bb0d9f52411f07a1"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b42825600d380c4177eafb56432031f9cae7072666ea2bb795d38c133af85f7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6fd16aea5ae20691191dbf6eab61382aea78e4ed6568320c7ec787a5f625ba47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3e130155bbfb7dfc8a2dd75fdc9053de81861638db57e78ac74479112725bb43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5cf5cc9e554f06264b2293a7188914e936004c8e1fa051675e67a44dae846f52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "486a1482a21a650669bd449449c9b682f04d3abb2c4011ad31fb3adb1479187e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b9b0b5d528859313c1213fec88f44b7f945c38d20b48903f61b0d5ba2db9f331"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5a37a812b319a8e98161d7acda55e1e24f1fee0078ebcd829dac3f1daf99221e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1e496dec8e830de3d70a83fe0d77a508c9cb4b0d260f2d1f28652769e3adc495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "803d191f5170528e42b6a7793c4f988d0bb363fa9dc58bbfb74ffb7bec6fcd4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae41a4183f9e40af0310f716e51f37b08e5d5b5ab02b210bb940a9856bcaada"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 
0, true, false, false, false, false, "90a22aab4a2e5767212fb5d4d3966258cccfb0b8941780957c0328267434134b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "479423ee65b5f8822cc3c0f4941b8979de51006bdc763f7115c2dbfbd73f4181"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "967fd84df9638528dc91c4d37812b295d10be16f7ec5cfd01a1a6475fddf76d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8e38f84bccee68ae1433a317c1cc87039a9aa177d4319b27dc3b907b6802a25d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9ba9ede20fdc5b19a5ef23c026e954977f25baff3f471b0d1b6d38ee85aa08f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3838ff0a9f9933f7b9857abdb4ef686f7610219cfa2967749e34f9836ea16139"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "33caaa2f2668af95b1b826e6aef6f059c9bd98e9e098f5dcdc2294f26718ed4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b6eef48a9925feba1b6e5ab83c902e4f476693de4c2e438afdc6e8a1db6847a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "bbe9b4a86ea0e39bb94f9acd20c1a88726d783d6b1cd387764bc7659567d16ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "9c7295ae447c270ae063d0c43ecb96fb642ecb669bfa196fb932d69871566c52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b17fc2014848df9a0019661db917cdb02f0c8090ed381f4a7cd4c07c0a2f078d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1bd384f528dcd572c0bc86a40150cadb633587752abb3eb5187acb2ed62e3f17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7d7f079c618b9c96242f4118f00904e08d58b49545c5bf099a1049befe0c5ffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0f8fc620d6c3a39dabd31e8c1ccfb00fa2fe947d29d9d05089d54a0e4367ffa1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c8d98428db207ad56539079c3c3e9f7f58994c210a3ab1d79913b0d2f553f61a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2b07148e4af01a87dd5512b4f2ab0329ef46a0607ac11dd17757f2b82ad51c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "71b79bac30b854ea7415372e18ac3343bfd93bf915e1f68ba486c47ff40be56b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6c5068b2d28782c73528de1a597d191f5ad33eb22d1e5e40e5e8651e28c1eb33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"f01b152f42dea23d3cf46a8c6f3817a03414ab308192897aa1fa164e98774261"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "acf852a2a8616045ac160259f2496cccc1be16781aeaa4909fb70ba4ae42fb14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "000182cd205fe7f6c50b1559af24ce5cb354cd1ef8fa5484dbaab5e405a22a68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "cc9ab4dcdd661ecfb9cba408d4906bab570002cb31a64a92489399208f29327d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "d959eabb50aca38d52c6ffcdb8f3a439f85787f45ce7dd9fa35b8f6319ccccbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cc9099bdf318b58a5c3b64f480ebb2936eba10df2796dcbb7ec09d397d0832a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a04c4bc737bdb35e579b97181e5120930696022ab85d9a0a47f055754d623673"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "731850d0fa4fd2b8f164b9ac337ce770b019ccfbb0432c4e52c34561197512a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "344bd10977e88e9e279ab3ebee1742e1bc5999b02cc214ce6570d71d410c935f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "671679bbbf1be35049a8f349e6e11e6a2f936f84411652d15f06545f225350f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "11e5793923f273d607b07a0168047a80862b0a7570e43edf3a444e5b1c341e48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9598c00cfa636bbb9e8dc38557d29a6d74dd0c744898cdb1932cb73c7a63e264"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "66fe5207667f55b102212281b4e6143870fea3761f4787a6734e2962c1c2c3ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2a906d12b95eb686cdebae60bcd2b6677c9a39b0cc684beba855159e8c89b4b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "24fc9ad5e9589174867c939612ed7e6bfe3dce899d231998c0736ef4c4130be4"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1959e4c14b3fce94daa2e60663602daa1e3d90d5e5ae0433b9c828540546646"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "79b0da6dc21a0091d9051267df7af9abe66aa0bc6eb3fadfd3679265fd0864ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "56fa4ef94d940f244594ff51d84eb332c37c790cbb24651d8a2082c38dfb698a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"cef4fc28b8e952493a84054023fd59d2bdf9727606b051ee778817b7e6f1def6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "842866f17619cc904f94ce3b00d2b9bf6476bd14a261c360d9ac90089d33103f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7b7fb7bec34a406738dd7d53f02210bc3385be0e0a328fe02365e8b707371546"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "7037c2c3ff393f12ac9ca0b5befa156f06baacf671eb8700918406048302f408"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c08f36b7c89c337c4eccc6515ba1928f90aacef6cb66f72d8ea1326ec54403e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cc0fa0c41b392e60648c1163456e80be483f34942471a19c5781b29545b38787"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "9863bef2f4bfe9f3297c9adfee82acfea6ddf2100ee4380871a4a3f3ac6175fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1a79c7e90e31bf8c8a0690ce1aa418af5e4a414f2f08424afa3318d4a13d4242"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "d46acae22406d8232091e22782c90c344bf4dbf62a1c6fc0167c7c53e5a95e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f0548917bd159af25ad89bd83a48d0bd7109e5379cc9c96406a2b7a951bf98d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "eb88a904237897562489eb48dd83f5f8cbf52dbc40c5e8f24ac1929af52d77f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 
0, 1, true, false, false, false, false, "910470143109cfa41dee4239dc059df76ea56013f4a0ba9afadf1fdcd1d6a0e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe31a29b66e5d5f086cf086210a6c9df9e461d01a8c17e37139280ef5eb93218"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f8f75c68a230e30ee7d623197ca831dd98529608a82e823625255a8cb6e7a995"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "3a8e0d7e65eb1ad17f776577ce38044af71efad72601d5e4f50b8ea2e5a7f219"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cd3bf335f749202b056e7c27aa6db0eedc0bf35aedd8523f4724306103d8e6ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fb89c9d911d299d1d0885c425391bdffdc485a8715f5181174db0d3a58b4e520"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "32624696e44e583416295b7c4665ad9835f3a22daa6e0710f420078b103d3278"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, 
false, false, false, false, "ee629f4a3faa5e22b986fcd3bcaaef62a7630002f5a04d6372802f775cce3aa5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f7b0d4120376ac83f65e10621a42580563c4eea967d84da713e29dec4fb98cde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "fac91a1c34718c810334d96adaee1d22e1ab3c757bffa0ebe4c0b65e9eea6b25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "72c513003529699d803623fe86e80c4981cc7bc051969e7c272a812698b7c768"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f6120b4c050f7c756bf7be9eef5a9c3ddec61a7109d3d966ba7cf7210ef92e9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "dba53dfb1e56d2af0e2d6bbee54782d9d3979d3042e0f8671b3a588df21e7119"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "6df5bfcfbdf90427abcb9e8f13f561fc55f08cff420a0477b566251b1c00eca3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "fe99cc8826b593b63bd21cb8bab7e0edd4ed50f2f4bfacaa8720033d3fb3cd6c"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "95f8710277c1655bfd3d7aa20ba889f777d671097000d3c4af5bea631f5bcf88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c76171127ede972dcd78277a8f7909051bb9609927e61d93e85958c635eb503d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "15ae7c54e0ec8f1e72972f0d39aaefee479fbe1adea8be846e13aff778490272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 
3, 3, 128, 0, 1, true, false, false, false, false, "b6fd59ea99d593bf949677d70b3463fb72dc0e7e6c56abc34259513e07a485dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "ab392a85cbbde5a38d384b17d82ada2e8ae3cd2e244a43e09a8ab983afad7e7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "319c3f80dacf44ab27188e150b856d64d9217406b3850cc709b3a2fe8ade8a30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "180283ab1e08ce7c5128e5223947c8baab87efff7baa91846834381aca9ddbee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "db90cc96cd3049adeb1f39a26f84e36b78abfcb9da606cbc8252c40ad3e1b555"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "783f821c8e1d91ad6841dae99a42ff2ea11b8ccf41a091a3c4b43252c42787c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4c0bb6a97edb149fe9908d10dda34226256f6348112bb6e646c28e8372a96081"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6999c3945b25faeb518e358b434203e45416d1372b909927c67b53619ba78b05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4947d0252e39471b7692c408ccc95f6cf543a9f969d156e0fa9a69a625d88890"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3597b3f12e3b4db416adee4aaea46c4ba3268d80109ec8f308a78c6b69752b80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "99d054b2947fab7904f64907b66bcadc5eb7b7429c47fa7e93730976b3e1da59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c0610b0f058c27685f32621a73f5e23c9b95ad81e0723fb78cc66262537d1bdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af65a0bf31bda517399819f17b85aec29055b0036480a04571adc66104a5cef6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cdccb36a4c1f57d5859f5fc46466fa7d3c734fdb20faa2c094b397ed0752e3f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a77d0fa62536fed8fbbd92affae0670f75fc1883205bf7b37839f10e5c1604d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"ed6022fc40c11c91fe6cf0a82e40f6ad1bc173d3d42383e39d1fb5701ad33013"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a802766014396efb09b646d6bec11a675eab930bcc4ffd0abc7333f343ac9ad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "990208eb0a41975334a92f7d837a816fd81629f162d73a3cacead04b56b88126"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "25a4bea79d940869c65af2e78f1b7cc715c81f083ae424e97b69d4c6884c557a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1000ee5bd0f9f7cb6431a25053c4759b68004d962bfd0601163d17b24190dc46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "354e0a51fd4d45a6ed5d843f967cdfd9a62f57fd35b833de2e37d880e0f68e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "77c5a9035b6e83e1f6c59b5466f40b28ad5a6e48a10cd1c6ab504c52b4a4fdba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, 
"af2621d751e2c94dbdc7d22462343b8f221979f77c1c06d17e2d2a3132aa5a2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "21fccbfc13514cdfce7480b589b4d1da7a187aab01337d66e2d2ba9909041265"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6027ae0700275bac304732dfa4d48a8658a444c8b32cefaa15cbd97caa716434"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "0f5044f0bfdf7923d09a0d4fd348ed4916b828f66b820d23e4b4dbafbf951272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "679c4e47f17ebcaeb1073d0576e5f8f3ed658e91015ae128781ffedad639ac6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "475b831336b7123b64bbea9cef05b2d51594b6bdad2a933e8d5b3a33acac401f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "119e2e7ee2c2c423066eef112367988b691fefe35771157a777e4cead58b30c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 
16, 0, 1, true, false, false, false, false, "552525d06606b513464242c2369c5122a900c0dfb5479a88c9cee669dfffb8df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0c4fe67e3a2376fd00e3c882a83c83e09f34d0587a642fc919ff6d51eea8717c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "16f0546067830150119e35ca4f439b0e4788e237d79db009e814d96c8bcb55b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34809a1a79cb8d6bbd346bcbe238eb54ed00bb2a62fac48f164a1243dc42f87e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "cf9a146835e1a40f632c8332a75b311f0948d8ae4d4232128df4e7eaf82ed09a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5e3a0b50a3f12ba3670bf1b03d0bfec95f6509c6145a0506d0b35b1a2d979a41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5affb6a10fb4537113e4448b7cca57557d1c67208fab4dd26fafa19f83de75a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "cd60954b24b6e0bba4fd4b29aa8ded5204d45f129925e53d3b9eb409adf76e02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "25b945cc9e4cbc493a9e92113db939a24e5429f50c5d1343647ce655e57ce571"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "495a0fd5e7fd55cff83956a5eae18e70010a1ada30860c71cd18d8cf8a163037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7ab3abde357a8946b4de1a3ef5f0a8b191e31e0b81563096de636419dd05ff8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b2f79fe40050501101675c924342fd4264d960f76e34564372fc4af2cfc6cd1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e353bb11f652fb83a5cb823b12b53f6a0b44c3be0cf78dace7f88c65da7c6e55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ddd62731f9640eb309c6863c23a3dd00c698561190b59ad8c9d8d024bda49a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "cbddc62cef7b41b95e143b0151c185c01ec24f0e6186a3f01d8fd6a329c5755a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af2e5424db6e2dff412cfcb940f2338c89e61ba081bb147aea4a5b25071d7544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5eb34c2b7f7c4c6500c752d033f3837f59725ba78cea164618bc3c1c46ef7db3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "26b2274e9c4c79b701f03fa2be9d675f2d48400fe549fa1f1adc3fb9963675d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c50f4dc1fa23512b59b59cca288ab050eecbde73ba3618b64d9e4d1aa72ad6da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37495d0323fb855a8956dde4bd74227ec9420c13deee8dd9bd266a1d96c529ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3504ada3ed7acaf9453f7919d4325dda9d63d8fcc474f37882d0928828aec84a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "03004440710429c36b1ce5436ada0df93a25127cc4e1b85ef2533b847666c402"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"70bd2a5436c00ba0a316b7ec09d37f0b7daced63bbb277d26191522c246a9f74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7b4880063c32d6f6d637a680bde54a523dda078f8204317999fd61f7c1b49177"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "b717791c919ec3e9a5b4704cfef5d608849d28c53f216c6229c91dee20467e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d0229206fc919f01400b281eb423cdf0f6472010fe8eb02d29fc47d3cab4c32f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bb81a50775699fa596fcc3582e4ab6c651fbe0c4fe52b222f686fee6a1929785"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7543df7e51fbe19b87d5d318694593706b10b3d36ed39fbafe80fc744c1426f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d50b648f983c2c0c2ed751143a9aa4e80e5aa292f1ca9b742718a78f2cb6e8f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "1855766a2fff92aa739eb79f3821cd95b70ae798cdb9dc2e6d518c2a1e920dce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "226c6db3e9e762cd7990d46a47a25b915e8f71df5c2031c429afee83affedffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "dfb1ca6bf3f942074ccfd08807345b28f855bdab0cddc02b6fd8bc05f9f8de2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a271b95c841e60f69854a37fc8ace9c12f3e239b11aa37db1096331175bf748"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2cf54daf6094044f87ad1854011ee49225fb8b863a8d75cc987a1449dc39521e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "bf83c8d953d571175eb857766f1d55d76e9d5b180e88c9f80c908e85187b32c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d2581978f9cc5eae715a86427bc7d174e79edb1ae87f1b5ab6a7f2a0d2bb5b1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9080b2d2d4e00aec7d1d03108edb8cb7fc72358b584a58b649d8b516654bfde1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1a7377308a68046725fd514a1574790f2239ebb24de5572bca14bf14c32c486b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "12cb04ea1205a0e64ccd2c6e0db12c1a78a744fdaf1e5577bc5b7d9d19e3eae3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "d7a6aaf8d6b15e437cdf8ea2444234a69e9642367c01d7d885227c5121de97ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "e8ca6ddda093d06fbabd963a272a121b75ffe4c260767f7ecb578101696e907c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "105366702a1cd03200a21260b5e51ca5e14313c09babd5cddfc28182b011710d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "c8c0b3e248de9a6e3ede43185ae3d9f49b8a7f3c63963bfdfdc7d53923b15c28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, 
"eb39bbdd51e2cd4df54a2a756a9e9fca9935104118b427a94bdd37b65decff11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "a109b6a6a52e3e94465b5bfc345b7ae4d519f01820998d208435987551df4ca6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "ae0df58343bc0253a1b5fa5e35873da21796ac7417782156b7cfc0f1477f3bb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87d31c3bf1f1e9413e0f6cb02e502d8d4a7b81a884cd3040bdccd644812ba668"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fd72529fb396a385bb8633406678468c26056ee55a7256504532fbbb0af465d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "91556c6801ad8b8e5cab8d41a3c7ad39b12e5cadb7fa6fc0798cca35d581440b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "86fe258d11d9eccd5ece051b9d64f8445da7b562d2a2781aa1e261a61f71cafc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6d54f71dc67e005eca4bf81a0f51de9abc5481b8a5395bd77f752313e0b5a55b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "349766efb40df3d7fdd8739d81cb615dcb19bdc2359fa168268ed3ce1cd31b09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0cf05d4fa3ce78030da80572e3973269e341de171510fde17174a9bfada4f7eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8bba587c46f50554b76897865b48dc97cedcc4ee63072a13201e6f74ebbea44b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1bad0930b06e66cda02db25335fadc3d7d7e328d09830ddbe991f9d40a713d10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b72cdec59f7f544a222eb88595e8945e8afb028184e48ec76a4d835a348133ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "451773d36d1142463e24a22bbe256ec16c26d18b510e047c98b34526bdb356f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "809598f020cf49ee5898f9e5bcb6de7df63d0e27e0979c5f8b588bcad5b69c83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "db8c5e5122602da9e2b7bc414aaff2ebcd2cb1b5c91015db8758771c3c3f77dc"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "da116c28862f149aa9fafeef350422876271ed715ddb66c38ff211ea1b238e91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4465c19385b3662486aa846fb5a2e4cb9e956128a2695aae45db13a7b24fe35b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "55ede767d3980f4ba3be17fe4428d1efb9543ee2e60c93000a8f25fe24d15d50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"35fa36bf9abeebb5253cc2402e069437b67f1bc0835177102657e012eb02ed61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0bf6fa6e316e0441a272a6ff49e6cbcfb792eb5f22d0998196580f7d15cccd20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "fc55067d9bd2f1459475346279836ae480d1dfea53a5c9066b743d2beab4d9d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "5ce8f9a1fcdda0eb60ad8af7842cea5982de8036a1e2c062d696269ed4c494d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a744649583696dd26f0e5f273b3f92b60eac7e19544b187a56bad15f8296835e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f9de7ac9056e484936ffca04b15b14d0906c0baa8c1f2298883686328e5722e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "6292893b742157c855a5206060550b58b9f80a4a21141dae72798560c3cf230f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "04579d05a069ffba0e745bddd1ad049ea9f06d560af7f96ed8c6093fe59a44f6"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5cea38cda2b1633ea06d5b6bd66e587d3da9a1805436ad01f3c5aff21936c21e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "eb1dca3297949129b08eab3ec891a2b18f10ae6f48241a86d6ef9353add14c99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0b73640214c3c1659461c8793782f035b14c26f6078cf4a175ec40d7869285d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "d54215da901850c8340a0c13c73d401b586ced6552c81d12d915a9ceeb5a2560"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "599632452110201a104d9bc9b6ac982d585879e0fdfabd0c5505531029fef4d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "cdd15113550984d574ee01bc769c393ca53847c17587d5f3cd72a28ddc8c575c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, 
"9d815bddb37ddb5632b5a80c23a54896b3e790a3b78d20c9c3e75f4cc3842ca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d7df8437ac72c65a8357c955aa7775742638d0ea8e25801d3f1fcbe101bc2eec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "eeaba6fea525cb70ab767b5999f84c348a5c4c7a0853a0d07d6b881c04fb78f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "4256f5f5c27100ae2465dbae72b2835aebdaf180342373a944455b36f2699508"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "089401654a02c6b3cd5bca1f67944775af74182443dbecdd0876b6cc8436f932"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "d60dca9b007cd952b311f5acc85c173653a8867d5b0fe7fe8c9adaf297bf5646"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "dc04ce615c45df36f8f435002fbe4cd7c22844731e18d105364711ff84bac851"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "41dbf77542b1b645096fb7d666ba6a760e170b26971e823bc3673ea8c5389011"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "6af2cdce640b9b39244340ddcee7a9620164308cb2435a91643aa8858ed634e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "139e1dc1fc639c84325c1bc223568d230747b07fad4a7243ae3b410e6131453f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87b4115ca460a4877422561b7334d204b05d42ca9f4111415c6918b4698da30e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"7cd8b316d48c662f9120856f9a3c68b2e70bfcc3cb0bd1aab7fad08ad03b32e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1004b95f834cbac4b90dd2322771e8dc8a85817b3b9bd5e3ef06fd0baabbc848"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9417b55e45473a4708129e2aaf4f9d1066c9ffff97b46966c3c7f5b386b7f0a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b85f6eae8fcbfab510168181965b6edcf738efdddc7f8ab653d9681fb8eeda69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6c2394b915e306a35c98cacc75dbbb5b9d61aa8869488905eae350c74e6a8b7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b11621b6012fa2f3651e459b9662c8cdc69a33657987d061c0bd18ee3e291858"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "76373bb94d42269c1df5f8e9a3c2f70eed993231a7699969b3aa5563b58bd888"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4398f42be31868152f36e794e4e71601bd2b0f7a4b9a0d358cd4c41d6b65ab35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b8114619279a5fd3c3c6a0269a2ca6e4e2d49ac6023367aba31745a9add26d81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2237c70e3775a24cdfe94605ae939b193c7ab7b1dcaf1f90f4333ccbd6b2f57f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d340f3839ad230ce62aec254fb293eef12935ce0becfc2a7fede1ec6247648cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "ef231ad3009eaededdb597101a82ba085adb7f5563699b910866b670bc7939db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "fc850fc2f25df6d367e2b30af359f2801f541410550f87e72f3ee3fbdfc01b1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a206dad34a5a87975bf1ba725cb39b452306c58655ba96afa96fcd9ecaaa577c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2f4b88520914deb3d84f53f7406d1dbbcb915be4eed5a57909b341c219916f6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"1641a25deb0a4c0dd3b079a4db029b122d42a158284d7d344d6a677609e3524d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1955ec43c4160460bc251dd1b5716bfd4a1b4429e954e77e29a467704ce9d84f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "46c6ead91ba4ee7544b901edf80f5b359749988d77e34a3ad029fd0245eef328"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "99595cf95edebfff07fa204b03052229e2bf5a8df355c61ac6ff8a16e390b44b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "29c04dbca7f23c79190e86302b7f65318026c85c9cfa2811d9b2754d27c1b180"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "57708585ba641bb3064aeb5685cd04405db2594c0f421d70fc2e82566ece62ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "2b31b39913bbfcf7cca3935a79aa34a5692a7f35b2fd50ff8f52f8b07ef5e21d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "025f8ed0c686be03110aa7b725006379a11ff6ee88a81e98aa68ff461f244845"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "c0af1f147413f7fde74b7ed0fa6a499eca678eb531cc22b36137dfb8c9dad92f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "45eaa77275984e4422bdf96040a1d23f3a1392ec659795175fc75b3091972b83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "5914f4645d5029a1f5cd5d43b024a4bb19a2deb6ac1e674c36d8fb820e90814c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "5c9b5534278ee0d1924a640395c1ca1125e4a737a65229fab65ccfa106ff6d72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51f6ce6d9d19f534c9f50e861b60d1c4576d677f8c7a5792a694277a4edf315e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34e90f0039a257f9befdfbd4a3400dfdb7bd19e140028edb3f44474a93e42db3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9a196a35ff9de397dad1530143bcab8371ff1dbcdc26596e154ef618a2382c88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "07783c74e2e73166ec602be18a9ae930dc7838d1b86003a704aa9085715fb503"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "49eaecc0214f5bef9e28fed2bb81c321fc6c5ca46dcbbc6b24f903ef82242a00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "70436793c1a533109a4c86e1d6da186343a7743ace0801829e7183766edbc638"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7ae5aef8921696387c0cf187bf5e2570e1a0db69a581c0f4802ca9cc2e8c287c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bd291fc5a330d32cf0257cec97f7bd8df1385fc3bc8760e5384203990be08f91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2fdc498324fa372bee39e206377a64fdc1c09d23003e0a2f8277abf525e72153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1177b2eb3331ccf7460c354d421065e9ad5599b665b2a0d4e1a49fc3ce794e11"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8fa5b6b34722dc73723979178e615f9be8012270dfcaaa9c31ed3c755d0c87e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2f29ef42db036da9a441421803466d73b3fa020aa5f6017faa8ebf6c1652c4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f16546495c774274a5a43752e7cc2f2df7446df8b16756ced502ad301e28732a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, 
"b04491061d7f6dab3132c1b8eb471bdc74fab7d44e32b3703bc18d5a63190004"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "af5abc35c9eba9b47ad2ebf173453a8a386076bcab8696c5d7abfe3c5374af19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "780b28ee83c86c7e2ce7d519e15ab4edfe2664f5226ed1d0597d2ab9dff8524e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0f353b6c4df9a6f9e9429cfb988ea960fd3266f50e61261206a4a231fe3baef2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b136dd05344f1cb9cba10c41a365c3a345e62260633f06c421390ef2d2ea7b2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "535394865b631d91ec07e0b6fb7a4eb979e57da06b6ebaf14ee5547af542bb7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "5af87e9d680c83933d2e160b686ee8c20e59a4d27d44724c87b878b56784d2e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c30b00fcfc19aec1550f9affdc60941501dc8183ba205d955088192110164f9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7595fc0f4f3cd5deaa4656ec42a8f40c6464806af690fe83a8c6fc397da4c341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b030f99f6b59fa582be35042845ef629e4ca53ef1d5a48bc6ed2f80fb2b0610"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "f8dfbd99684a41cd70f5b0e05d3219f0dcec9483a303185a0ab99e346164c2a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a4d8d76cf50feed08a1d46dbaef19bc07306cbdb265dd16732c52b615d39e656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "486f3d363f3cd2f38a8b77b2beb48772a5b13a173703f5bedf025afce2d145ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b4d77f4b2680af7163ea1cb0c5a4794c790d1352013d7dbc6af1c7a739e1ec40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "47c8869d6822b13215294c3fac08a4a00b8ff6f71cef9d9f2ddb76a70abbe92a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b08443d9546b583553f1a4f1bbf949686b60925725e1c4a97d303bb74574a944"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9bc9f1eb8ce6b0dbda711ee592c4e89994384e7ebbef72a8cd7f50f1a1029133"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a1249e8cd856ad119abc5477c5b629681d180be71df394a1b121f6730cc479ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "e114d2b9c8865579afccd15e29211d7c4082a32710c0cab4007cab0d0cbe1497"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "6ebd3b9da98cdf4c8da5df05d9bb142e1aecd31e71a02bda048bd19a3297d4aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5af93f8b438346532127daf91e66025b1ef559d104e1ac5f9127b14c0ac26261"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2bf09d89d9aa77031b464f2a197202176823ffa5691683172c2c400996b7576c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a202d2fc4a8385960ef5c699c9684c11038831656f3420818ba4ae7227dc0ec7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9fbdd9abd5cf7a99dd835b47e2a50583b5126907837962e6497f7986a14de6d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "bf2fb480bb19c7f4652758e59aa3528237bec05c984737a4b2cd27bcbb5ca43a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, 
"eed4e81532b5f9bcd9d3477c26cc5d1768cc23caaf32a41b2995001d96035375"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7a5ba42336f009cd219a7ecdf5facb1e08470bb2c8f9cdfff8058cc99754b524"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cd9128007e7d556f0811968023a216843433f86d70883367361c739dd6e50ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fe7fca32abc642fd69f74ad50784cb45e30c64c644baf0db9af6ae917f701b19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "9da0a4904a71935d6b624b6e049b0f6ffaeb050aace692a80dd3e0656c793f8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3be7b25ef9179f28e225ddd4ea5a4282c9e7729dc795b2c577f7bc60c8853e89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "77f7591e5ac574a47f6a961c1b6303fa02a454f4957b066d4d140eab54fb5062"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fec3fc11428734bfc86f8704225b9bcae964fd374a65dec92331e81b6a4b1a2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8db1071d082829300ee8c4974d8bfe2d16bc0ccd0a40967445e0ed5371c11710"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6b6ca2eb265b1241eefbfb6ae3e9022606f99cd1fb18235a1ee4a6a32a57869b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7feb8df77a83c09ef2bb35026dc8dcd193a41d259f6f9132f552dd52aa3bddc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "995eba586268ea1c7d763d84593ad0c2a40bebf01926191e3493dc5142220ec4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "d3ce086729f126237ad5ca8c089ffe47c4ed54437925712c07d9530afed27daa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "44e718bbb180e55f5d6675236d0a7f9aa6198aaf82503302e769e3fb9058ef5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dbe93404cd44f27335a694a0fb735ebb58dbaab92e4891a081d0ec1c77164c7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "778e8f73f4662e7bfe6008d4a587dd909ea30cd3fe5fc58a06fc5f77efc1035f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "89eb09671d50267e8a3ac91e4a8ad0e26f8bd4901dd5a9f2679b51231ac11b45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c3640b6eebaf348a0a34efc84be4c6b0cd1168a865d33ca1de1587f3b480c512"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "30420e93729ffed8e08732d1db773839a5e1687f464346a21c88dfb6fac609d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, 
"c94399b535a69e2890e68c2e0905f1a339b04168ce751d776251154120c1cfde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "457bbe4919242232343f56a30afd83770c1e36cb83072e7cc7afef06509627c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f062157c37b0ddcefedd761e5a7b011546f359c5b8c5977216d1fc41c762dd1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "dd0261715f5093b1d17d1e2871b1e49c8c0503daacef91f5333abeb3feb584c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "96ef9f8d6cde14f3c72faca0ee768cbd926b427beb041bea3c32ddedc8e87500"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "727bd57a04c5b6d6dff0fb5c5b4f4a8ce783404c1bd8517fa00e3204e2f97e56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "b735b150347aafff12b81f456bcd8e036e9a1c82c77dd7dfff4c9ff727e33cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b7ae594d3ed3ec9ffc4546516b63ebca633c1093315f3936cd4159d264a8a846"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ad0d52c60921147cdf069d5249d59b60aa5ada9fb74eed5c0c48781b4a9cafcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "182db23b0fbe9ee542c49e56880e7e57d03c30b269e18c3d963730dd514e65a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "652cb71098e755b4607bd44a2447e948ffee37c5ffe6472ca5bb8cfa1fc18bd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6bef5089abee8668e7d310c9bdacbc578a45affad6bb69fb29d92f5bc99855c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "c8240c0ffd8b67eab5af0c1c93ecb48e2234d4feec274da75dd925134ba26c8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f60c9cbb880d7dbcfeb436b3485297408dae87a0a2d0a71dde3a063c9f72f395"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "04f8c740875b512537e7cb5266f58ba3f4472683cf35ec7547728001172f28ae"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9935e2c7a175c5acd427d69197296aca405d69caf9b706e2af049e2f799bbb99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5c4fca51ad4f3dda7e207e88e8949f8bb8132aedb853e35bbcb61a23b05c4876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "eec10f431301ed6d68e517a4b7c8501613b6554f4ea6002d4bfd395d60cc9eb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "86ec86feec79290480ec5df4e180c1e6eb860edbe81d1ec0f94d0b540eca67bd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9ccf94747545de79b4257fd671e92c91475bd109df41913b1d3e7dfac079b886"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "182641eb20ac5c2e613786942d76428c95d113b342a27e6473c711cacbb06df9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "4f34c9518989e1d2385e14f53efcdcf7cf738e10c0b214ef08c8d475268916a8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "1fb715154df7a62a4320c9080df716de2174e8d3f92096f6b1aeb37f0eae1bda"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cb883d211fccbd30a724fec581f5332ee3d56443d8abdb0da9273b3b8c0fb53"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "9fb46fd2ce300717c258ec140278d25138ee5a6d998bcf0a18a2a76ec981a48f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"bcb4db62ab6719b8186e88b0abeec89a400cc5f2c6edc4366040abf72f113021"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "9c8dcd91cc8f5818b9e523e37d4b0ba9278883f72895d810003f0568307d0d0d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8c52805fd89aa6ea6da96fde7a360ae7dc3497ce2c767ced3829861b38dbcc20"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bdfeffa725459f11363966eb27d70069a8449e76bee11ab1f004349263b252dc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3b7e351e99b5cc2896efeaf7352bf011c44350ed3e6f4e7dba4aa1a84d6513bc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2169d53ab8b87d6e053637b9a766e13da6b4b14e1da7e7321a05bc8200d1403a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c7b8f8a8d89366aa9ba1f9751c997a57d524831f21ada80fd3e6ee12cba5e484"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4ebcf2d6b743508423a1c19c52ec502a11ceaf2b997483d4cb43f4dad81fdd7d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "29c144d99228fdbcadd86dfa0120b0bd1e0e1875a74b9c4a235771e323cc737c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "230c543154402823509c20cbfd456b1cecf05914923cbf4ad29e32dfdee7ccbd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "87fa015dcfa945308865c304dc1bbd537e69d68b929940b883981e1c35b8af0f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0774f283f9a89fa04df5b57e561460ab108963d82c3a3551cfd810d43b7b7892"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "4f55d799fdd362a87f26b2dd111d611980270ea4984956d45816cede5f43439f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "bd8268aae6a85789cab10d9fc4f2e0bc4bbcdee08e5cbfc188c3a0764127961c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a7f6b45cc48f4244fd67b28db27edfd4685ca912174f3677d0f3cef7e6a93b80"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7dc3ac9dae1e073709e299e88e47cafe9478ad618930785045fa0acf138681e6"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f38b469bfccfda22e8f335068ce9c7a7b55b8ee165492e82377589b74ed99bfb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "8ab1c905671bf3ab579e937ec58e6e7f2387eb649136ea8c6deccaa04fb00ad8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "b6638a471f09fb65c8f34bac7aa010f80c141e69d027767df809533c34c848f4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "316a46a33b0c9fbce73cf34e0c60bdb04784384f47d9d84262011ed2c25c277b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4944285f0a277fb90eaf1899e62dd145e6a89f95fb9059c4eadcc4c4b90296e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3288a4266f6961d2ef6308783a784436a3231cb4470318339b9c7fd097fecdb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f41b6fa8ec1f56da8371bd92b957797e27892193d243217f4288e8292941ff5d"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1a9e34e44acae43d3b0494a98abfa6f257220d16c46f1002f5885f9c7fef6002"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "528847cb7ec54aa25fab979c8a3b872590eb1ccbe0ee039240d67c5311d8f4c9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "0f39d9ebdafae344cde326cbef7c977a37c8a8cac0836673235ed6fc919cc284"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a53d1428ab650e4a4a0e2147c90093f2cd92f0ff79696ccd85d99c943ab0e68a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "55137cf5b1dc74f72714070b500a90d089ffc9c9758bc7bfe23e616447e25aa0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "0f7cf161327150d48bb1c336ee4be582eb361c990018723a4f628cc66b284ecc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"460fd548b29cb47b45a2c696ead62cc2b484fee6f8a0cc0d22e64dbf389dc74d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9f38e033d2ce8f76a1f7581150eaa91e13cbf4c37fb3dd20de0daf7f6ea9f118"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "96720e8665a8dbf64a2e3e069deb6d82c0bf40d247fe9256971678065cae27df"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ca5e80b5d5ab038a5b2ca69d96ff640dc413da14ed45f60d6445abef341d0d1d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "42acbbe845e33f78c1e40d920b97bfb48d448175e3d0840d7d094c9c61e39574"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "821ad1981854714ae754c8509ea71ed7b01515c61c0b31de7c1e19f4afd76598"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a6cf5e55448f57e63d2746546b72f8ef2c358c636e88d43856c948f2fd8e2d47"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "acb2e8ae4bcea0b04a384fb57a0cc73c2802af9c9ee20b7f3a0c545771d7d048"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "73b650a49732e42fb6d83526194bb16a65a0d2be6d5030575cfa2658dd3b3136"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "70f3f75cab63dd824ff6d6bf895457d1166d8f7511518def56f48a257157d5ab"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5d26a17bbdbca8a957dbcea7f9023207e1401a33191b9db2e57f29c67c7f61d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"be549af1f43cfb6b406a72992e9e5dbb60142bf0a55cda43d3b8adb62573a00f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2948cc24a4954678eece9c94a9083540beaab586568bdbe8e10c48b06dbbc8f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ead78ceeec575730613a1f4ed56c40c8562a73e4cf5daff2e92badaeeb3ae6f9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "4c0a4a418485d33989f1b6ec5cf6d76b70326ca09509a5f7ee89248d45f6d5f1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "801d91bac1cf7e0ef3f1cf725eb714f2f2a5ed473aa30b987cd315d7251bf096"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6f115926c2e4dfbaec8ee88816a9bcec7b45a3d5685cd12e4cb2a39ccfb0a98f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d0f9601918d7b962d46e83b295596582d9bd33cf0d1f9bb7ae6ea34a7c6fada9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6f01c731abd80ba83f139b8e27b798fd9f2805ad3572369585d9937dd046d7b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "32e0e4d173f43bb1cfa0073f59ab35c6b606e1c419bed50ed3372111451ad37a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "b36225dda9fe5c4cbf30a335674cc71fbee86baa3f5012e33ba760db9a3b899b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "bb416dca27d7e494114971e48c3ae4282f0043ae526e22f28a316807ced8579c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "6df75b1cb2dab6f0113ef8e6eda7369ccd139fab8f109036373aeeddab8b1ff3"}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "50909b1d8e5646ebd23ae7af794c4d44fc4f22973f36ec0c7d61869231adcb98"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3fbbe94abf9b7b7f63c26886408595863da12330774167e1b55faa08d7f9cc7e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4a03907230f5fa458439e610e5be58ee92ff10fe87b40d1230c18d8c9e22a05d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e615c26f7aaeb165b8e8c648fdc4314a8a8e5d759185aad0fa31956f967467e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d29c4c4182aaf7a290c7f7d0c8ac970a067d81a3695ca7a85cf9a72e566b974e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eb5213d2e580a3119177a4e053ce5784179e7bcfbc6b33d203a8a888533964ef"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, 
"677f4c3c1437b284eb34b9845d588a3c11a8b83eafedb9338a4db99b2c503739"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2aa945056fec691b9b5903f22213ca344ddc4ef8b9c2bc248da34f64cd1abc6b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "75723738dd885b2fb9977c3facc00bbf62d43aa0d639751d61e032cbd894f147"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "257d22762003dd5123c301dd0cc81335739c13d43f121d720a3179dcd7850ed5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1363b3a9aec1bf998adbeba25d199094fd72df0ca373012aeaaa11911ee11272"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fe4edfa081b5f3a3923bdc6bbeee1fb6f84abb88a76b83cda5cb4eade4fb16c1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "61b533b424fc566cc6217ea277af878866151369503ef3278d9fb31c2d97e53d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 
64, 2, 2, 8, 0, 0, true, false, false, false, false, "b01490134b8237b2527b67eb0675e1cb3449b7a2b9bcea9e3b8368eda28b11fa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "b27741daf4b3dc2c518cd321b924f2c79ccaf4a0089ea7236a6ab789dc2d7f4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "69ac21eaa149195769d6a6cf3540759209cd24fd9e29a0cad85d03ea90d3f249"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "6e890bc105cbf707132d3231951f226bd9d31526af39ec884e310e0215dfd5a4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c8c16dd271d5934add27639892307646bb54e3ee9caac0b9b01b8edf84b7f80d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "0717e1a3869d1df32505f9aa54dc03b2e69aa283a1fbc80aa1f3de29ba05aa7c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "72db408a6840bf99f11594e8d0f8e69409a4f156dfc61bb993c8dcc550048e41"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "455dd22753f9e41802f827a1088d84e0ae9f950ede3ac72cf8fb7321e025ba21"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "dc682a60e0aa9c57b348e60238acf0639f718c73fcf46a43e4831d8f022a78cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c5ae1a1be4491c83605392c0bede321955de62e02c4b2a9a96d59601bc3ce23d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "37da47fe73dc7fb45b4bfb41e5272bbc84331404f29427fa151c12d6753f1748"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"e2423ff169f3bd1d7fa142347371c269e837ef7c23e3b17ab959e5e86cb83539"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0bfb53cd894112eafc6be47dd328ac9da15b177a9f69b81229c575800f965047"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9162be9d1c69258a04b68084bfac6440c945f5b155fdfba25dee6da9e410e8ec"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "fa537fec3f6eccb3340802e360707cf80cee0e75aa9b0ad9b30ca317601279b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, 
false, false, false, "b8d3a946aa2da9482cb69daf02614db34071f8091c5b6888cb9edc414ba971e6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "ddf823782723bdb9a633b18f0d9c0f7053e3122924230c0194240494b80a3b84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3466310d0a52f42e39136455561f0b80ece163f8aa473b053982e2ef5889b29d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1375010cdbff780835889587802d34c77f96a3e766d7a75754212335ce5c22be"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e7045b374d11d7a61bd34bf3e59bc5d4658bb955748ad59a50bdb0ebb76c628b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4a8af28a5af81b4d89de7e39f5402f7499382ad8e9d8eb11512b00bc174d9684"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4008406992419e8da952d25712da67eb5ac7d337edf7fa333e1a85509cf3946d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e1db2ee1ea569c82ba5bc6074c7ed16740365a8c5f143b45b1fcf56d95cea6d9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c4905ed7587be0042a35ab39c9bafa1e491d19130be168dc56f66c3d3e9ca7b3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "9e811866ea52b3b85faff0625849b153f94996c248e334e8e7fc3695104e1466"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "5f5f481e2f2fbddb40dee009f4fb452d9d52c9789c4a7a68bc6e6c680e7ff118"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d63dd3790eb52a0a7d66f10490ffeac8e334802128e1454cb052ffa22f4c5a56"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c6dd40a560308b28c5c309c448f5e5ae3c36912d68e2277475e884a3560d1493"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2e968ef8ca052ca8927c48cf06621f07f48bebada7df450b71451635de89584f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "374c8eacc50c427f4c49869b57e1a3127c10545d5f03b2a05222d257f4563124"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6c55771e992dd9a2e8118aacb148d0c4b2a23f1f5b590084865263fb9095a77"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d418c0d6c339aa2cd900aa4f137ada5e4398ea23ffa40fb872c08b513a9a84b7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "83440b26591540db553ee8239c41de66c6cc316b18cbdc2be83e9e1a124dfc13"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "cd04010643d137757e0b3e3d5539d8bc7a8177e8635e559cc048184766931533"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "4e044c7f75e710c8789758e4ec5af1f6e4e5de5f10dbf47da890615f7f22b121"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1e4ff83a06eee63027374707cf29d6029a38951ceb9e99ce119b2e97ab0e0d0e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "82b2dedc92a260b7138e37d97a070efb94da1d2719b8feebf958acd84c8e3b8e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "41eb1f61966e68858f7f810502d42c30d6cf2fb95f29f271821d250f55bae374"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "3ec454cc65391f255e6dce530fbffee768e753e865b073728235edf63210a3a0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1f3bf63cc656975642c2c208feceafe1670107da6c64f584dd3b96d0a1d9c9b8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "285002b3ad35ee6e9ce0e93ddff1345ac30d2087a15eb64aaa518b5770249847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b4ce37da664e1c8d09ccd8b41d65e9380392e9f0368757c492759792c4313fee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dae405ecfde1a7086ce0bb483a1ad55520cd4bc3b3e2e39e9d5998e24a16b99c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4a0a046ee21a50f35ceefa7ae2e6b4998ad6903fe80f51e4f5b4db085589ad58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"361c8062b46b59fa022bccf62db501b5c2329bd22fed4054de1914965390c137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "de476a4e7ec25ac4c97c215c618e086bf2fb6066725cff4a065df4207b7dc649"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c33c389e8dfa4426547850021957807d567b54d373961dbcdad577d9e55bdfb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7dab3be35d8e74d50f5cc767b6c9ed824dff3d4fd9562356e974fb6f97756f0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e6f976c884ed4b763cd8dcb00354da177d2c18be7600e1581ef9b108e26d29e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "413c12f1020589ec3421625a92ef7f48e42f01792bbd05669ab114f0e4409c47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b90a2492416e941da09f190e2ee48f6a4cb33371b4e2538dba6ebff48d614c64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "5245a3bf26fd1515f05ce53e0497a082bb6f3ead23141d215a50d68adf39a12a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef95050542686523e0da27d52c19d45b81be68137b91f06701e632f272e65712"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "31c8832d1023f66aadad802a5372adfd4538ca26aed384d373abcb8a8a8393f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1c6742d8ccce5ef2d81eabbeb8606952512b7dffd8c9a045157de032f798c25f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d581c5223529ec7f4c110eca9da3cc749e038a33ac5b6baaa1ade60aa1dc504b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "411b530906f9649de9183cbcf91de6d4bc91b19949fed85e3c78013dbea28e7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "68477883f806efbe60c1ef0484281d3c3d757d50ddea48be6dea8771e6cedb5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "d96ebdbb6ce00423cf6ce0cfa5d88f3cb8b136337fc4f6a373697d9880de54a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08fc1866315e1c6e2079fdd338254a20ba33aab55a42ca4b9829a6913fcc75af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cae15477dd94082cd6ee270ec5e2ec5f4f82c770337ab2b50f1299468901473c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "650ca39f07b7191377236772177c6dfbd44d958c3b9a17f26181217a9543121b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7fe3aebfc7b8281ddbce97a73ce68501815772d1c1e283d712d88a49133cfffb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b2b0aef79fb33d27af6fca313f2d6c91f38c378256d02d2bcbd30f670173b341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3397cc45eceb71d154e18a61c877416516568476cff80257b629f119d01e6226"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "9139bf19238378c04d5d0e0230da1dacafc9109625e1de124061ca3aa713e82b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "2b2d903b3fd51f2e371d7aed07485ad703ac97714d3cff205c69d2b74c4c921d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "07474ce73181c7744875877f7d02511922da050a6310eb8cd645eec7a2788da4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7f1dc0923c1b3396d15825fa57bcfc59ec9a29f6e80451e5759cf9c38da985ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d969b25dc758c6fbafc2c6565770b339a215c27bc529243cc61ab9358682f380"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e292cab669ea2df096072a16868b20685c6f32ec5b292964015655c036b288a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b23f148b10dab1c8201f33d1566badb8e390cfad0f4d5a1076e211cda02ea26a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "21730738f73c5c5b148b6ca80c5b44e6811762dc97e54e4b021b7bf191ac4c41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "334a8662d98ec7941489fded603a2bc81efe7fe37e4f90f6ab0955fd585ea867"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "410fa3dc445207cc7fc9e7043fa63d57707468197952a660e14dbc99ac79fc6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "4922aa042d713bcff368dc73c7b85225dc3c70b32dba07fddb07a9e3b6c7bcb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8418229a5b6037678bdee35768d57f311e2ed224c02eecb7677dafc0cbd63f58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3ce76f23d4be064c022433a1d865f17fdc27658adb460b5ad8d3a446334a37e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cb0218fe8e26b85a022ac2025760a589341ad3685c82f36941d1f852ccdba6b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "91e889adbe86e5e0f2c1fe586e50a90d7cf115291933f67ec32dca49f584c77f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b3dac11c4966a19cc994cae3dd6f6885aae98b0fa3a6bf98e7259d870a35e2e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"7ae0bdba58ccd95b96642fdbcc08c5b0b7fb493f852aaeb781154eee12042397"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "554545443eb760d57ccdb2a756a9dc20a268206fda77f7fe275489da55045346"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7462782646732bfaa443db7c5bbfc19e4f475f2e96794527df5d4052ec07539b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "57ddaba23e9cfde13c22148b3e61ef40492ac017906faec86081b56ad2a5eced"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "589270169dad7709a57611fb9d15ab20e835b5a5ae9e2a87da76fd183c722693"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bd81142e5170b90a199fb33df43c6b60be78fa61c3c5e03a0b98354d6471c5ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "181b9627019eb7bde84fcbfe0de6b5155bf2a88b591cb0e9c13b5e11bd31a5af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"070ea6ad709119cf7d31423d509e25e0e7cff6d5e652c01f18b1dec9831cc436"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7eee09c5d52e9a90d292d4f330f84c30a58b3924fe5e232d9ecde7873e94b7b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1659c39a86c2a2bb508538c641e0031538c3055471e89d893d343d0ccded2c7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "0d9d6498acaee35e72d070c59eae4cc999b8fba4518eb48b9186cb109ab5b02c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "36c964ed340fc22a85ec0b5d16105e5c9a371f30a9ccf3e0516e1dead7d6c52a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e19faf863694e0183bedadfa89be862e7a8538adbb73af3587363145f2b67022"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "436fe7c7c30be1893e80762c017ec5456ad44dbbd282596592fd277f610be734"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "e8e53157e8d2402165a579c860c43c5a61b25b9b5c3e4e5dd2d808e4d27722ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7e08fb0d278215481f19e1a7497f7d16b7b026638938dcf5d1ebe91db503417e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f433bb58cfbcb9964ac30bcec20fa61728c3feea648d207658ad123d3313b351"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6de13c035c2d3bc349ef7d2644fdb4663d761f1c9bc24879b0aa5a087172916e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6696e084829754c2fee3e85a600e5f94c0e86ee94ab317b4dad626712537e0d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e7c75814cbdca50752ea4ce72ff74a8a14f9c039d67164ccbc22678c8e2894d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8d5f4baefcfc1b90d5da3e3671fb25f1de46e66d7e13b9a55130e1381a56e008"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "66708df91ef52833448d1f946bfbeb3d03a74858b19a6be855bee67d2df60d40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f412ac17cff96421a24d3d5f518ed8b0fd3a697f0c502c3fa8546f9f7ca2ac01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "63015997e0478d771f5b52fb293aa0d06d1e2441efacab666468750f867b95ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8faa0bd0a63abf9e4a15975e50552bf70382054f7a88bf60c4bf662b2c9084b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ac9ada1965121f33ff25a2e193c9140ed88d4f050b7f951ba682d3c16aa22ca2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b05d8abbae51e9ae009b84eb8ce66bd2b4e69334d82253ff9c0755bbf9ef15a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e4f84554733422dc595c4dfab3c880bf4acb0ccf732a521fe98eab87bd54a06f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "81268a7fe864be3f4b982ec4a796b66cea3556f975e228cbeb3871030d37af96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1bf4285bca91c05e13f868d00915ca0d0df6912664f43c1bcf56cc2bd0a4313b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6f74fc5cbbc5b1d67c4c7f030657abbf8511d54292cd403f823e4bd9574d04dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6826ea261131f29b0e7459d5ca44f40504e0ffd84384436b5ccc959d0ece4b84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a5f18910667891c8d2f645cedde4ecf0ebba40f1ee119c6bceedd599ca784648"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4b4c4e27d55f7cdb471e603f8433f78235fb6a86810c7cb1fec198350c378e24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f2ae94f2395540bd8872f8011757afdd16b08e6cf9130d319798c1e239519fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cf66990c6c95224782fff685f5998eb411796a7d49e28adc7baa79126acc701c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "fb3b06313bc5e2151beeb9e4806665c0cdb713cc0d8775f8595f223186049cea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "05e63dd8691e841e674ace39a3a35e1b7ee763da37b9890b8a780245e8010fcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c889befd61d27efda91f8cacedb055e0b41c45fcabb88d6e5b16675fdd49ead"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6d03b6d157ca92d3782e79c8858b8f7c28d6d3e41c58b5b9f8c6c6228a35a4fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "68489ca0d4571f736a5370a52e51da915c60e209db2e71174de74b6410bdfcce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "350058c3fe7ed093e0049e2597e9125dd8f6b420621c8f4be4b22d19ed361e94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4a2700b7152ad4fb89b7b91d0c715a6fa58c801456ecc9b05e7f10c73f9731f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ed6b20860dcba20fc2c727368d488d2c85e74969808ac9368cd25547d640d147"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"2158cf81ae06df1f7a7a89b283312e978bbf88435acb4443335213229cff244e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "87912b2bdf6339f6179e34b1e9c0470d204f2faea71bdf1abf8df7c9e99da822"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4d11b716399c7647827f1fd6501e0f72846496e432530dbe217ceb3fe3ef25a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "3895674d8083516bfe06ba2ac1fdca391c66043e52382c5fad921ac97baf0a4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8c9584c109a73dd9321514e5afa07e78e6eb264e899b644df374ca24955ed4a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1df866c843a08ad3f7bdb1e0c9c5455b8668455aed3c026cb71d5c871a957faa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "bf26bffe4caff1f85a5eb3cb7eb2a579db727a9cdb965dd93a3fbbb1310a2d84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ed18705e7c62b6a9aaea73ecc488f61cc54ef9714635632b13b8e61e29256539"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "50af3b58acdc7438ef0e362bfd3b55505a867d036721b192173ef7f4ec424f34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4a9185c5ac9107d15e8ecaf2ac56fe51218be6f426812916382633f36662558d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e2c79afc3c30ce50889e278704c2d24686db98b3f8bee6bcb2121e8881028077"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "16d3b525a6015c0a6cc97fbf692be2147efc35ff7b3d08987138c6564f1e8d9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0184ea32e6f67491c2a7db0b389fd1fa395fcffbe332da88be453bd2a54916a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fdc64ee864c03b283c18db04fb35c93a11ad36b2463202673e1ccc97b04f0062"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "23b1a12781197d5d8ccc7991aaded8571c5ba4049a8d9763275b6f43455ff775"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef47280552f602e9baf279181e844471a9e33fd803dc655487133a2dd0a8dc0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bdc3e33086351daa36faf88384f8ad1d2be247271f25bda681b5fde76e36fa6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d84d1ef9971b121b0a4a80741962e3b035f2d1b2d5542cdd9cd8f8c3caa290bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5c17b060041843c194c41bb6cb79ec96aa407f29e4988d8676b0ed0f3def0ac1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "002cb8491d4a52d5c8fd28cdf375d0199ef4fa9de5d014d751b6d97c2e357b3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "685125d950358ad9db696c5f93a738769c8be7123bc3e9c396737df9e9a0fd83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a8940b3f18bafe4b52ccb1e012572e83ab59dc7cfcf15579a8398ba73d6095b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d9e9006de4dfd5f0def250959a99e4144654cfca519df12a7ed0e5eeea1aa08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d6d5afead98163a4d8162005fc81c74f20a1a16a66ee3429bf33022c42930967"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0504bc2c68b7b1674ee2f4f9715f63bb7f7f05ffafb374e1d9ffee2311ab4245"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, 
true, false, false, false, false, "bbca61241a7d5470c977c6a979f37ffabbababfbcce6b04d4bbbaec8fea886f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a3674a44e52dd7bf455d9029d4b41af5621dca2256abebc0acbe02984226a307"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "196678b86c35615325c9296b2cc6e724697b593301b9f8d97171dd7fb062ea92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "2580fbc5896dedf76e8ae1af6d33cc44480dd337ac8312b0e112f689a9596e1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "07d0a698e20461856728bbb98f20767804be64bd7adb4c141695f56887914ae5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "127877f43df2b516483c5e84fad79c4b0f9f1300c1fd65ac30ebc1de10f68572"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "9a86ecd5d29d82f925ba6386417023f109e615030ec1179e2ab2a979dffac5ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1a57555afe868396a607b68563eebd3f4841afc9d42e19e058b66ed69f17b6eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "da584d56225e6647e0848f48f76a43f93365b361b31e3362e2f0b1974f4d04a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "c81942c37d5ba125201e77a281798ac6be2b3d1e0cfee13ea5cd7a83ed422539"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e2971443adbacc24f246506effec0b0478fcf2aadadfea56fdb945b385ba5f1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "89674128b8fc2760b038446a97b3c5cb39db716da35a76848f2551953b63b688"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "2bcd93fecf10f93a65fa8ebfe9e1c14fead4b5aa60a5dd36760dd231eab1cf49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "efce311aaeacfd31779d869549027c811c7ce59a2082c949a15c21cc638b933b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "16c7bb25f8e8369a41ebf46902312bbdd6479bebe981a82803788354ede97068"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b393eb438affa7a846d2262e2855e78a2e1da71e3b88bed5a445bc85406b8f3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6ebe217e90a0dbfa14c3932b76d4609cf06175e8f8207623942abb7ceb2471a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1d0d7f3bc80f385207f6585ea5054f8096a895789fc0871121b27467e460a6e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d799be43b1a27aa75d3abf43f5fcd0960ff28324b9980c99a9002e97529b230b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "17b4a3bc8ace51f359621f16b7749ded041fffbc2c24a8c208160628de77d0b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f55c3c102d52e7e0945532254c693927e1144d046822f1fd22688df35176d6ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a2481137a29866f336db568a3e39d7e1101ce0fd2a6de4419b3a5c3aee7b294d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "067a651237d2357ab09b9c0b3ad5632c614ae6d60acbd68ebe52b81b675ff01e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d3ef53cf3152303eeef22139d1066c6122764653d8d0d23c5ec6e30c89aead2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e49f73056ba6879dcd684ed211d684745c211fc5dd2562969cc5ac51cd5a73a0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "03da687e50bd1d8f40307e533af5481d54be21b410bb8fe4fde5907443eab7dd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "25ca5d20d3d6b4d1e8356475c4d92d7c0e40c210bfaebd3b0df58aa968cded68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f70f9c2fd3a7a21b3dc2a953fc9c00e4eaefd297a5a7289b2c88e47ac0ff44fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6fc9e7c83cc544ba18f1dfe0832981c93e5133709197a1c7bec241874122caa1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "222238a9dd8dec2b913b26c5c55acac8cc44c65651808a1bd57839d5205a2a44"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b3c19f490074286776c65f7f2504c9ee745f586d323e236dd3fb3f287eef78c0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7a0e360ac921d5d6540533e95571230731f1ffc20a2a2aa09f11de156d53eab4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "686092b177ad49080adf04dc6b165da89e44fe9490137b25745ad194177bc368"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "79c395ef232a280234e7ae0212c373c2238dd9fd4bebfe1ae5143d65abb1abdd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c3730cac4f311f6b1d8d8c5b1c026ab3450cd5d10e50a707e068355fc86ed00a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1b9be99d95dc504154ac80d10af9d5fc25ec92c4dc9dfbe259cce853f82ade97"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a11fdb48f481e747fd4a206ff5ce915c1a93f97925ee16607b0cb2972d506994"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3348d66a5b441fbca450a6c2fa870c7fca8247ff3329dfc9a5145f4f179dc65c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "322caf1ad82758bddfb734fca4056ce258580f1b7724357b143cb3751801f42e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "bf60a478fba8cbf16fa8b2beb1f6634a799db0d4c2e5bad2dfb1c2f2040392ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "03449ccd017926be8ac6e88a98cbf81fecac6e1d84bf51a80126dfb24f8f6768"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b0f857ec836447de63562c192ba78b7e6fce1214c80b8bfc3bc28306f73e7b7f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ec503ab4914b8e5e91464cd95c50f6a520464f49b6df4c919fa76b0a2276d49d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f58af9586243cf2fe94bcd0a99ea557cef829ce3a66c976f2b647e4d956fab47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f31ff02a352aeb93df080b77154f209da7775b484c810147fed08f2dc5a80f05"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bc4243ef05beb39c9a82b13f7bce858aa9ab70ad52fc83f0616f313eec7dba0c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "bdc77dd5b8f399dca461a1a2d03ec2e283ca834ef75770a9e73ebe04355451e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fd877921c396bea0bf993d767fe9440958628eb30ab4fdc050129bff6f9a9834"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1aad80c4b45c9fd2204116991781f988cece34f0c43132e9d311b62dc2899b82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ab312ef57e71e20e45c67b072d61d323c40cafeb3ecc6a231880584b40c92d28"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6935347240e1fed20b7f3efee678c5bb4bb1512e2c341590e28c91b446818f6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"4ec08a4a82b0b29263b5b073f40c492cd1c238325e4a47de04dc11f621d3e80e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1632ed7994e2c4b0edec50731c97af800fe442185c4e6fc7f1b1ecb4aa0fcb2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "da4000aac82326e4f43ed74e214fc1c7c220bcb70a734ef972fb27013fcc6e7e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c4ce90d8165312712be77cbefacd363c7d0596d83252b15475b3d8df4c4148cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f7a19561eb5de400d392bcaa272b9ca914c522a472e8391c2982d9561573b963"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "120f9b7054c042ac44201bf00954dbe4d55a404b95b4cf4647ca8aca2357c143"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d84389965bcf53cefbaa97b828b98ab58cebdb4d18109fe5cfbe71ad3f30bfcd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "98a73fd759b07673797a883c026050f9ac4fb2e1c8bbe51113667f9d25bccad8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ca62d2f6377a0bd13080a19c5b029f6e024514a7f7cbaee7d98dff5afab8e851"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b9916ed4097e6ec29d93ba1048ee5b4c0bedb9dbe5248d3e4359474894b3cb20"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b48500c7f8e1eb09ef85aece90bab46d0981cc4cbd6bbef5e6e057dd910e2d15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5baf200a5a104b73c4d511d31b05bac0d637f9c703fa1516859cd8df879cfe1c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f517ec87f78d21dbd197573592aa8d57349a317f1c8915bc8b65be7fa96037a0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6658daa15285fbc165fab4ff3bda0f66aab6f7845021401305afb15f4353aeef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5b3728bab415731a8a37e804f89f2076529198049107b3822faf24a21d51ef41"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a017ce9aa1ecbba833d48a6e0b40113d865002e8d61cd316c1724fbc0d03c9f5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9125866a7abd25751ea0ea21c1ff429180495b55b98925e1230198ba445d9f16"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "25e66da4037f63713773d176ace63c21051c082ef4ed471ac34ed95109c79e50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2cb6f3750e4c30633c7343f83448eb2065d084de5636af9a39fd1fde4bb24f83"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "3369f08d22c4a4016c937cfe45c7146fa62566d858f58b02365b6201b2803a59"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ff0adbcb6aee973b8270d156940ce34360949b821bf68336b9676ac26224b9c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"661886b7ca4f9f5481df511af4100783a2552d9d9de3b93d420a4d4c5b1c9929"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d5e4c791a7f4d7071d1424fce55f8f80d65996082226ec7a46080351aa78bb5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "037404cf3f2dc03ca7a65a0c4121ad1e69ac8bdbf96fdf48376cfac923835b0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "546f29535ef7135498d4b0fa1b1b09f0496d4a0f392c32ba121ffd8fdcc3a06c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "9c28c1d10633bd3687c120fa07d8241f76f10921b2b72a9fb23f1d14f7e86074"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "33983e35ec695ed6bdf4108612e1aed6a53e0487651285d76dbc115324c5cfc2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "80271d667ad4caf46c6e44b68cc9cee79e02c4f5ea0435ddaa96422a32dc80aa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bbe90255637d8db42b5b5832f7d90f7b09cfa94808abff928d0906c1d2ed9398"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6a19d2d2dd3ef3e3ef8ef00a9c59069d8e1d1aa4b63bac82f25117b4462d0e86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "13927d266bcd29c5c0a8af8cfe46d6a2667a990d6ba3dce1c64f4e1c69d2e4cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "36e981355f6b1389cf2e61ec2f9c7ea053760dd95d480da13133ae3b678c261e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "70464c9405651168e735300c90f9a85ed7b96d5d51b1fcc926879474b6f28076"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "97bcf8b9ec0bd409da7e93af6519f19e37b9f9dd99376dcd26d7283a35d88b60"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "dd0fad1a5922c0d8e494e3c2ae3be6827c900f7e97848652c3650b00221d830b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "601c75671c2e8200b30678d647aae9038e16d0d9c4ad853658ef358332d26d65"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f49d73b6c96067b4186c841f7b5a4cbc84c65db3ad00625e28aeaf4c12b879f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a018ea409ee1143ff4a676ee72d468fadfe5c2e3acdd93879b7912e20dcf9f38"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, 
false, false, "943af2b3366fb413b08413127f89bb1763c9612c57ac4bc59d0b8a63a9eb90bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4bb6fa7fa231928b0c65629e3c5b05c2d7c5b1ece3ec2ab413d3435a702b2766"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7d638e908db0529dbc6c6c86a6c33deb3a1f217ab351b2fb3f6343093359636d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "fa4b814ab09c41ee4de447afd7a90c3b5675e98ebae5c8e77c9b406776f12e1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "35cd9b42ef3498996a9cb380a6d1d814ce0da1947dd5c58e0bc4d059b623e041"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5126e4f56a25290c2b8d1f330843d6c8591231ae413040062dd8b438bf005011"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "dd046dc7b1845540c18b89c34c41ea18dc1e1a29085602978a09c9a3175212cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "4ad605c2d8ffadbca3d29353705e8c371915a1b0d94b58d670f2306d10bf7b71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e0b875c5a0ae8e7c2945b48e749b375012f5d22ddb49a7c3e62a059165195f96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "02200c05bbec8791832887523523366175d5784dfbf671128a66a6913370f81d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f728b32ca36a0e6020da9ab1879387d0dfcb6c6e663d0c59d499a952986709b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ad33ab04276b5fcf2ab2f6cefd51335c8e8bcb17f0045b0ba9f06f26d49b74f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e05af8ec5f29ec8d104c977eef753a8b6b93715e97d78fe06faf84f68fbece14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8382b55bdc6f5b96cecc842ae32f63cd7234847e3efbf0b1eab1141422d0e987"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d3811617e9fe603097324687c57c4eaf8c4dc4abadeb18bd2b377230be72e6c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ba2005237f76f5869b0a485bc6626ebbdd8a6ec9bb7744fb4565b3a098244044"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f50f828ceea79bc786c199f580b5a7ec4803e29061d025a5cdcbd614c319c3c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2306bf599265c6e517a243d3b74d5d6f337305f7e8b79641ead03fdf44b0a547"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, 
false, false, false, false, false, "c19f80eb0dcef908b5b8d1c0ceead9933b5a513485c13a6d4a5c2e14a6fd964f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "887ccfd85a30616c0c58ad4e8eea1476a790a3da4902d7e18cc83c153d234555"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d8589f2c0222821ea9e51bbb819408ad278c4ec9552fdb5e36144bec5a5d24fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cca3616a20e1b54effad66debe673dfe2141036668a22449930ff6b71ff7354b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "93408263c682a7d80429bb8a1289f16a598bae3284c6fc86a60d22b4912b2d10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b1d6a9cc6b275ae13f705df371e3c449a07b5194583764d431e016f84060dd35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ccf9b978862412809723d35b26edd4646ab5fc7249e19733efa85543c64c9754"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"05bc323d443d200e691312ccdd0d0bf3579e5f9626e218512600d213286c7c37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0817f661035cd9eb3f109bee3aa7a84a12b2c1fd02633318d85fe912bb95f2bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0b578ed8b90603f4d220b70c0dad81ac23800c342f22b2673f812dd10380c4d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "72ad3c69f07d577491dafb9327197b6288c43769959ab05092feda3dc2d9e847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7cac3b205e7ebf2ce1c507266e822e5d3a2fef248f0089804d400510fb345427"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b4cb07bc41adc8d4b5998321531be4ae3d153104c216c34f243e579b83e56538"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "64dfcc09d1a5ce5859b8e2cde6ba7460bbfac495944e63ae388936c83ecc8c38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "01cb5a933c043b54d53fc63b8e29eac9c16e6d03e0a002909b5ceb112a3e7dc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15bd9807f0d9176127fd28fd396e070e2631caf3de50d06fee6faf1a294c443d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "599dedc127cd124ea5507968205e5dd8db7fe8c0bbb31a492e669083db3e4da5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "aaccfcdc786772f2b4d2aa33267ceebbf963994c86a236dc6282e6f7838a2d1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e8f010acd8914f1df8ebce4b064b06402c3fa87884b19c3e18cfe5833d2625df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6db593b663e49fd7e95bb998cf04f65f9c0902bedc188c37aa489bf902ffa137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5b568f1e38749f0ed5411373595e07a0693fa0b2f969ac1d16c69ac7aa29a0ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"bfb0403a6bab602491597d0cc806d72d85f9888a3ff0e83530b7ccd85e091bfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b927ab1aa4e7983385acc77ea780fba0d572f63eefc6390979d91cbad59c9762"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4d31561de088e89031e458eb788e2a8b6651aaa7f385167155e5abca5619a501"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "572245ea36afb7ec7caf475b6295856159fab32d3e94f82b5369056ee51cd49f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a32d309b5894248e4d80b75a586701b27d54042a070a9ffd7864f1b271f794bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a6d1f1a2207cfd2fad701a90719ee5524033cafbdb322667eba4b62d1d8bb060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3bc1cef645542b0ae6081cf462bdfc25d50cce04d5a7bee4ff121662ad4f1d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b6d3766ba4b37945c5275821118b92d09ec4b01374a22ced66a307c5708cd07f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a211df68939460edf28780d98bd46c8cc286f68529913b2cd19e94781355d03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b879b38edf501952f15b9efde6a19d56a62a47ad2a23b1dc20754f72b33dbbf8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fb629ba8ba51a628be9565c72bdfe3b240d645702a4b48ab2ce674e6516236bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"f27c3d176194e9691be59e6d30c8467787da0474a01032d6f318438f7c051fcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bd029c0f24d0e40196794bfb3d44b13a34924126946deff31d7158de4ff96760"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "80736a8cd4ed89b599c0c2c3fc819403ce49518f93bf0ab1e61b3e3a3747e043"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3161c4126c5a24f2d63fb399d3abaf150492ee87f0b91f55c723fb4433ac1b68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1f44e818e546f638a3cc25ae2a674785fd8c255ce1c7dfdbaca673019a877748"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e9617006ac8da9b95a8c7536396a71ccaa5c8c0ee9866e1a23b68e3ff81840e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "90713b6980f809036f9bffc19a765e7bccc0a06ef1c4e020d9aa1790900b944d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "972852ea83791ac60941df16a624c05bbba193ec9b9558f03eacedaad6e27288"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "170081a70beb293b27ad2024e890e5fb342d2585cb8dea99d979a6e952a14431"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ffd149c2802544665f9a83092d6c37cc52083f9e33ddc80b2d2c8ab75a0c457d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0f14f17e08d3bfa3a8e8bc440ec9543573f23ba9b170a609d468fdc2c0918d7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "18609288798de4ad1e664829ecc51fa4d627aac44b3df1a5552c2cb8837e50bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9ced1100fa545cb1fc2d759d845a14f6cb44a44baa82ff8843475ce88742981f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "56212a95dfa3cbcbec9bc040f829f7f9193f441959c84e87751930be2ef01b23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4ebbfe62f0c2986b5af1317ae1ca3c69c4dc9fa5e71830f81eb1f83cbbd2cc5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c8e7802d5bd5709e5997d3d8402d664062d0bf4acecdf1c64d8ea6ae87173fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f07490d6b0e5b4e6d0d65b9679057f3c78ccc2f6d6cea3902b36a47c1ad69c68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "0b83ac5a5c26ac4e819c7dd66bfbb644c380cafabc9e4966c0441a4b8db71b79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1a6a520543fc30fab1e9a827e44defa4a4e8b9c2b5594ca74db49c585bb514eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4eabd0507943f2c76867f0670ae0a11a4b361cb2d56d88d2f23d83e93d47e1d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f2e42d947bba45db9ca364950e3a005d06a5568643da130763f4d4626713a63d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7e7b5b1d8d9e1ef6e73d44fbbebeb123d25221c108c86ca0c97636de830f1b85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6c8e1561c777c9b18af2e5f499b875804518cdc6e0ced744f0dd8977b275dc46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305867fedfbc1f2aad8abe488ddf3e2f8d691de02f6b25803b84435a903f9b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d8ec07ab25fea1a1b5ef5dbb6a27a1ccb5699652176d51b5609c321767f7525b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, 
false, false, false, "b3fc1ff1c6ae37fb1e2f2ec64667d171e481e0189a2c5a8547f26fac34cbbf16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "617c150a57938fb315966da841282a655ce7fbcd0f8b2c41fc44593e7bc2f3c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "014ca6ed77e613127001efcc67176c6ac2545c29697a818efbc150b6290c5fb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "67a274c355998846112679a9a3df4476a14ca4836981f3fc81bbd3e74f7ac684"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4bdc8201ed0589e885fed4f9ff2f1442b4e71e4ff957e4637c0940296d3eaf22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "91cc4280ef36620b79a20b88bd893adc3f00299fc169202b6ff64a7c3329260f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "33b0dc47969f6df59bb4619a61b54937e2ab01ac9f6fe94b1181121e6f226dc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9a12f7b88c138d09b4ceeea1a34597273509c3db45e3cc3cfaf58f8981dc03d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf138563062daa80d416193f0fd60fd4fc2368728f1b63055ef142c4aa3ff44a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "144b003d9392b064ae1cedfa17e1aa4494d4bde0a68ae31cb1412dc9048a8842"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7ea47fe29534ea378ce0e58e1a1bf5f8fc452bb0e3aef0243160fb3e99151cb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"36947c46a3f4853ce4f31de5b01ce8cf9b1bf2491b5c4d3f001fc88815875575"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5ee2238ea7cb134ef6a7abc9e21af083e5a77da10e3756f455dd74dabdca2772"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8f48d5d2fa7f11d7c8090f3a7743fef26069a73cd90b1d19efb1860cf220caee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17fff9e92e29717e21a489e1f9d024e60732528880643af21ec1fe8735520ad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a9b141746d348a38d6efc062778ae56c72e24305e01ee1fe782524e29006bb21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d948582baf0d45319aa1b03b3f1ee6ef452a9953186ebf407cff36a9407d9aaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d9325d91f51a3e3dd5f143cbfd7605b4aac0c530b933a268b67f59b272bf6b2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "943d5653df28951afad104b64aeb5c410c83b266e3f9242c31524f4274788172"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4d35ff8ae7a8104a77d57ab58d4e1ad12de618e4fc8398700a9918ee231fccdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06a171df8227c438c0368aa7521e1e18e0aefeadcc1ff231b41757540064377f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "22e4f34ab9adefe4b7fb850086504632db090757d2bd5eb809b29d2b60997d5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "dce999ffbd29eedd728cb4ae831935e6e838680c25fd2145a0c9de1f2b571433"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4d5e5082b19e5afbec71bd03ff0e8bf4c44c9ee7678e42de9309c49cb51f88f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6bede5df69b7825cc90d0fd27ba8de0486b43ac786d5da43cfa29d0b35b53887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f0d653b84222b3f16f85a0113ed48d6e3bec7d66519a493452515389c616dba9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf3e8fa1ad57c0f7951aab9f625f99e71b83bc4bf05adf2c5704b34a7ece01c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2ad3f43b2dcad53a568af8937d946a713e1cd7ab7d30981771f9c4559477408b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5b36ae62e6bf5030ff2fe392d876b54cb03de6ff202f306b22ee0f1bf5c052f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a8d195f66957c7c2aa686dc01a6b435c1614ad4166644ea416afaa00dcdda541"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9bf621bb25e5ab8f0019ae0c846301a4e5ce3129e97320204f0ee5dcbde8d736"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "125d24d778d9ecd547d8dd0a73dfb3f05c088aeeb31a1202cb9390a1e8bce033"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a36158f3237850a6104f16ea239f12c52c6f2d60ed12bd7b3669c26c219b3fe0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "79f55f6b44e2ec220cb387bddc648d4af3638dbd3b1d7b33cf13f4769c8e4b38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "795d59adb81386f0045f871c021b430841e2f69c02438652e784811336aa8ee6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e57062e66ff3139600af2023419345b5941aacb2a071da4f705e9e4365c3f867"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a4f7ce2282e3c331c7c0e7d5a9663d643406c9fea67598201a2c4012682e49b6"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eea0ee5bbd572e0d4d9704a12987c46ebe1b0d55f0cd002d3a6356b7c7194768"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1f19222f99c5a8495c19e27f0d34099ed49dcf71420d224b214195ecc62bc9e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3ac47eeec918d46a1459daf42988e172e25b401ee381b77a71efbb577a8feb8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 
512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7526746531d42122bba7e5381892d48e9710e40cb52f0ae5fa9281ce972eaf60"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1e11f7fb2ea0a9087bc584298bfeaba5613ec4c8239cb4aca9d3b2006455263a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7146d11263067316abc32f60026707001e97b6a620ae8afa3a2b85839a4f8832"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4fa24ecf819e4cb01af69d2314681712c50964a82d87d3bb4c2bdd17dfc34db4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1353ddcc42f6cc66f6ee08fb22c45627b861ed3e4084933f843a715174552482"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e4cb0df157e96212b565a79eb487f7935dd2012934b9afb446b063b4fb85f18b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "dbf527a74a2b0766b57535042ca6d6e0b5591ffd06e01921a67b51c5173dbfe9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2f3d6104b59b917948f6b3b5bbccad960e93c12a4a83631b9b07f770684feffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c7c45ea305e37e79be18e17904f16068831a77662126d72417f1e80773e8ee38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7898f08be043d3f006719e68d71f09f79bf473ae34aab19b3ab262a1a4924052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e40d31e2e6bc567267446b097ebcea9d413377af76000d66652e4d632365c314"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"de1da4839c51a07c0b75aa530385b88642ac062916ccc80f5211e87be1ef6deb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8b75f0cd7eec17721b235585540445ada046537f6ccd6a83a185ed415848a149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c0fa2d6c625517745a0607ca6aa7c84a42e6d316115b673a4fa3bd003305fa50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e6ca692e7a0781ae0586be9e9f1d91f62e4cb14cf33758df690a986263b05b78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "aa264d6bc47cb61cab2f18705bbd938b9f7fb900172521e5fe5c246c409368bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "85316ed101eb192471a1052c0da54c0d638c288391dd7a25dbc02296be7a296c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2d2ec1ba2bec477e82dd86499babf318d7d940a9206a129c9bb756e7a3f359d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "af2e4f5f603c7136a8b15ec33f95c5d442358f8741e0c0ff72ef38626d97bf90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4c239e64009d24898a9c79cb139efa31f48d0b58fb1f6aef4aa7fd22900f7cd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "75d35f3b6edd9093e458734c307f849062eb65e6902b3e323507490e59c524d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a03678184645eec1aba25cfcff50cfeb2aabb4a92538259f3660577bff0cfbc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"d0e7d2346fed92a3384390cdee8ef0d40c8bfbc04cbdbf8498db4c812e6d6001"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "360cf65b8814569a997f0b42e464a26f83aaf6edc6995afaf988bc0756ca465b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f297ce8a5eed5adbbe03c30e6d4fa5617eb72de72527f5dd59ff6ef326498a5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e0daf676c0e64e2b4a7d1924764f28a774aaa8a5346a3c5def913bc21b5e630f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c98f6cbed970cf184c9dc2280653ae8d9a3a858755c846fc900918a64577e2a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7c27b56d1d5bfc83e7807c3aad9132f2a8b2c80c663fb326c18f59420dcdf444"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "57b2872d1e919a6226c26a8ed425acd881700122e9f70dcdc7321c5085ded141"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "16f6b52df67f622ebf94c8c8f6ef7f84f9a9bdfbf6ddfd1e0569e14caab40367"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "16fed94e9aa03d508e8179c8ed5c449edf1896202c1e024348851b8f46ef01b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "afcefb117b45ea7fb0980b0c6ad1fe5ac04f61332b9ce5c471e6f1af1e8b43e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "55eb006cefd5749074cfc1558187b64940f1a635eb6564521803d8e7b28b4857"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 
1, 0, false, false, false, false, false, "8fd1fdca9c14c7e8ed9bc58c74ac97fd20aa8ff9036c5be7e915846c058f96df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "87eac59ac6653b8854b609b0b5bdd7e48c100ecf9a7b2129e0c0d495672fd57d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8d92a9fa28640997109579f6f4a0094f7964b79ef1bcbf937c8939867c13e527"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "489b736a717f881eeeb4e18a788c87c4554cfb67f03d6e5f6c4edabd28eea4a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f46d1d0bbfa3e99ffe3621182c2542ac3fc5d252309c6bf94be044af7f98f7e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "49b37bf6b7b3ec563ca55f915efb3bfa4164190d02d26659b475aebb35c7e063"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fbe7a013fb10e9d3198f607569a3a36e0d75db8b80a1246652d8b0af20e75169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c415175a6586c0d8c24e81b0e02687009ed725b0020763ac0d4d884ae578e4b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5983405fd9dc4a016f9d4b5fe0e19b8e249033e1b88047695d52607c2c16a0f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "612fb001a47dd7f5edad06676a521d2909b49d55cce4db01bbd5ecccb3576f57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a4569b9389199160aed3930a80f77dbb667dd762eb2de60a779b9e2138fa3d71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"dacb1f9c1b83875750bd5792c146dc6e5eac6dfa6acb67352902052d68e21611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "70fd20eff5be6bdd3326cb8c2627cdbf414ba87a77b437e1782b45ab6ac4cb35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e491cd65ea0b67d1ece6df623dfb58a766bc6b4ea5100e41dfe49cc8bd3fcdb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d078460c57819b4d0d1c3f5fe6ce73eafa2033f8d7657fb70b1168a533b2b25f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5cdf5c0a903da2d995101012b3fd90c88ace027478357c156176148d93745450"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "eb5d3e145e834d93699189d88a51243c9f08e43c1385bf010b06c239b3dd46c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d547b157dbad1ee259c650d721e30a8873f000cd6e3dd0256f4a25c4d348bc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a10445f07089ba8c8ad92ea0a70df62a526acc7fac60b735787c001e309f9ff3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "090abf148986123b7f0b15d77374092ef7eb090efbdf48bf44822bd846142c08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d5d7c82855b9d6aeac6707ec6eeed005e90f4a87a74b3909aa59e07a63b1fc4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5f0a48e82eb69fd2193b245e5628a9724e39df758981eebfaa2138551cabe9f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ebf30a181d8315b098205ea7aeb3eb92bae50f7f37700092e40af7bdc3210574"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2041c1bcff893c91e2895180071eda92459aabe6fe634828aca42b782689f75b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5895afc9817af940c56abcf036ec8ff7b9ae03c1f658ed50283024c20568b105"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0edbc421ab05fb39e1b71fc35383b3d550a62c95a4abd293053a7ac9d8745eec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cdccf28ba3cc22d872d050dda3f102e6320aee3878b46348ddf987609d6bfbe7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "be51ea950d9edaee840f0cb1218751ed8470e77f72bb8161a841e8b8b2edcd3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a32c4075650da102f824e322dc713ab303fac0c3363c0374af2748e757935e25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7f4b1a37df4b8dd077a4c522bd6e5f295079a912dbc6cef017c195d7938ddd10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "62fc5c05d9887edc1344f83682f3c6649e7a6a4bd8f94358d28092146fb161f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b6644272548ba651268e6f84ba464585ad313e97d87eb7289c430a17748ba29a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "81206b5f912d7519013979abff703bdfd92db0e72b0ef1e93eda8b60c707426e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "986212c4ca276774db46a551545a6234fbccabb5a5b35bee13f633e22eb45855"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "5ae09c82b34f3db257bdd94e5275314a2c4fedea39bfeb4f73acae00d0cd4ddb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9353795de2e8ebcfed23bd106079ef6d6b0dd5ff96b395056cf30b5088bf61e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fc12ef83fad73825f52ad44468e4e4feac385311fc5054b122b11f6f74044c83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7a652556d591bbefc06b43ec1b5e9eae76f5d0aa6f97ce57b761ba655b96dfd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ce58a1fe839457a8094a2a7def1761fb3619e171a79560dde428359d9f505566"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0ecf55e3725696d8f4bc52431b5fe5a63ea3eeaf1907839d411ed04ce8419ebf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, 
false, false, false, false, "9fc91cf1354889ec2b67f1553986322ee2bae0024dd86103da264fbda9d9c7e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5269ddaceb4bad03b6f3162c92ddd5b314ce7c936cce4bd71dd5f2dbc408ac92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c29682e11e319e68d1b5a5db1a6438bb583b0b61525fdc8833de09387217959e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "60a7a01fcb7b3cee19c304c31427d2fcb50812132ba01df8e6721d2a22bebfb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1a4e099ceb3379cb53e2783a9f5ef1d18b7cb716bea715db43a45f665473611b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1c0f8fba78ecdcfc7ccbfa223025c77df626f0e349e7301b7f3888a36d204168"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e3878148d8a450894a0fe3935a333f84b2364eb7101ebaf970ff2d90823145f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00cb11734a5e21e45cc98769fc0079a4868abb84a43c60fe686b654a9ddfb98a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "91aa2e3cdcff6ec2703e411344b62769648b8612db50bc9f2628366c8bae8de6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39e713b9f51404b56629ba2c03941daf35c9d0266721dc0b5cb08c50d0f7aa17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6c5d39b1589758e2c6eb187ccc528418766ea9d4a06aed309a701fa658c676c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ca428b467f16e18f3268fb9f693ec1e2925cf1119302ad154b209243c0afd679"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d41f7470f967edc52d17067e26fec3569284d9883f5e3bac612367956ac9851a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "abf36a35ca9d3ec7795edca478c803d616ad6f0d1b067421d57b26f63cb72c0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5b8e951a959928def41a140f3518566b818f827dda71a93dedee738e0db8b0fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15446a717f8614aa8d0ee6d1fae4935bdfafd3d6495469cf49e13f8abb8e06fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "928b2d2b3f674ea10c909f72d411b8410bac6281b031056b5d890e226e21570b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "16c6f70dabc5ba230f5f0daf37c3cdd5ec79b7ae7bc8bcee0fdf1031786639a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "0bf88739565b7f08a38c781a2e49f421a882a9afa81ea1a317336156a1786d42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3ecf158d1ce0468f1b4d018b1f03d4bfa67f52a770c735ded265fe2eaba08ac0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "58343706cda0ade14d3b1a1eb276bcd9ffa294e6f025f9bfa671256adb70fb67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d1acb626847b4bf4822563f4d529c90038adc29240214754a9d614bbda876f6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d39c41027acdbf1113dda57ee66199c39569ffc3af673a0b3f4797e4c6c35811"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "97685d2389b58535baf05dbaa92adb8b0704a1e0db78af281f60ff7d74ebd168"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "66e0a0231587b0f897edfd6bc89cc0ef5263c37f1ec4a77b847852dd1fccccc3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c786f32cde2501b718b3e5aededf4d1adc7d13238d8ad40fbf7487512457b5de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "169586a393b78d5ac46fcddffbeea6adec94914b9cfe5717eb49ae98994b629c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "95cd0219cd0ddb6209349f18ca19549e059ec399c879900bc2924674ae057c25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ae72e413f4496306794c4bd790e8540ca67ec3a570100b6a2d13edee76087bdb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cc32129b827d55206992311fba216ed7f912567ba55bceeb4e7cee21a97a8f9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "da81af4a26292003aef93cb08da414f3398730fd4f137d7ac09edd5e9062cda6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6b80a93f7cc683151eecf6e769f9d7cc195886dad21a54c2a70c630692292287"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0576977c5a4e52f255af1791fbe613ef7743728ceef8a035b18b2bd349a25220"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ab12f7632e3a639ba2309728c1e922a51e6ad7665adf1f0f38b7846a498ff4c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "55ee7d9b5621a93316f5a1cafee41256070e2c8948ab655d62f15cdbec90005f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9f0250f9e68d3f22cfb9106c0d4e2e814341146edc55e1157397ad19312a79de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "05925425256a3a5e0a4f337d3dd43fc4b405377318cffd59be80e4828fcee543"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7437a6adc4c7de8518028d83839c4fbf26e4a626e78d625968a8f213c7ad5b31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "28270803d084ea77d15d51f711fb53c38a2367d449a97969e9550983fc1a2b63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "69fd313dd39f6aa2638200cec1163d48639ecde21f834d0b0033120dafa8ebbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"1443787fb038a4c6ec9032ef9d260bdd1e74cf8abc2654512aed50cba1b62895"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1186bf72e6657767c768a14db6b915f199fd82e907604e8f8afb1a67c3eaa5f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08cc6797d6827cff8eeb485ec13c6361166df31891d43816ee5748afca068eb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "55fca17259324e689967ea7d82cb6e35e6c0b06b9133e927e4f2007893b74b36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "be5612dbeea96d69c09e31b5112558121b00064ba85041a41adab0d177e55816"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "744f29883c731833ce00790bb00ed6a721464c3010fe2190b579cd6698cece52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6ffce0f9972beb968ece8d23874d220f88929316fe81a1bbd5d37d50e42741bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa71cf66265ae9ce0ac7fa1d07be3733f480b48a92fa0a2dba96b72eba227e2b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1f02d316ec3df836c8a262aa5db765cbeedfe4f9616385aa8aee7ffad04fd36e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "560cdd6daaa3a348a20f44dd687c519fb0b4b51a162e855539a2dbd30b317f00"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e2d94183b0e30accc6d6c6bf437b51d612939831134e810b356d793c092bae34"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"c116e0b14b2b321cb0a93e7519e80fddc5af3df400f9bdd1920913784821c5c5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bdf5231cbe222e661a67812cdf2829e4f1f25b947714c24bd84559316d96a719"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b06fd3381a60dcbe984a1c1427848858a202c635a89831e0ef628c97af697355"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c8787d5a7df04f52c988c64c7cff9cea3d84be89ba65b13f14bca851202f8d05"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "47184ab746be0b88a2c2a6a418283fce68490f3c47a3c29eb3ca3b0b3dcf0a51"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dc8a670d26ced424fb47058f9e8d9f51dcd4383e86bb5e665f3f56d01f5d9bd0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "963ac970c070c834a078edb72f1f0c1de830917e72b0149ff733e9720f2a69cf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1c65c5a14c9def975371a5476c49c42db4a2d5cd23c2db638b9d7286e0f44dd3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "95c5c9d1bb6023ad115ed8ae8b75ab317c9142a5aabc7c0ec2b39d6950d9887b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ea44f57a56fac0fbaa388f9da2f0e2b071ad6b9b5fca99be4ef2e44cef892aa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "49d7fe4376a9e7411f9f45df2307b418d1766f99e95214347a721eefd916d244"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, 
false, "5679e5d51d637103396228e8719a761361ac15101edba1508550a9e50bb88743"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a0e1b51c0efb8f0864676cc68039c186c13b1f69de9a8d7b816b13a20155158c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "50f346d23f930c1277cf24c293492c10756208061c003ad86622157593a9d1e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4b1522f0913d209e03a72068f66477a9791d55274f656aeea90ab1208ec73156"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "01ff9dbd2734d11b5f273ee4cfaa04c8cf9bbee796bd2a902877ddceb31439ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1211da1a72937b17402db25ee79036e39fa00da71ddc8e34ea923857c4ccfc73"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a2120aae610b64ec82e0d4845cdcda650834eb9911e12a77f130766148dd71ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b7a661a78716067d060185740c7086e6d354244327244981efc34ba619522d84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c6cdd7365d565a5de9d40e8d379d90d499eddc1c6c8110a884e07613acf70054"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ec2887d02d985dd0c782fa6bb6acf8b668c01172b62ed9528a0bb4e549d93a8b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a3d4d6526c3693e71d4ce051f181ac3441ee42409c4160a4300dbd7676ec2e81"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 
0, false, false, false, false, false, "77442c9fa68fa3d6375d35adf57c5621eef905aecaece965ca9920eedc8f9763"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e17ba02f62bca92562af42ac5fb6ce045be925ba460998a25f5bb24b0036465d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5b1e7caaa771ef24ce28f820a43182d44a46e193d54bf090c3eb0d59d0d949aa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "47fb47089120fa77dd32ffc1f0a8d5d35082392445759a9f0d21aaebda8b5bcb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ab95ed920017c83e85b71a2f3e6cee0c5746210be641228b1e900f8a183bec13"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e68b0aeb811e59d93e3adb75c59685e5cd1f69ec516d822c13dd2996843abd65"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "228c5c5e519329aebf09cef72062ba10dfa2933aaf25c25005b7751296e8165b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "25cb23c05ecdb47b854d94cf84e5ba1c8408c24a4044101f325e22f0ba7e7040"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b05bf9145cad8cbc02ea3fd651789fc441b74c066e7fecd1906cda93e5800843"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a1f05a7c84112c2621e858b3200938ea2d3323cec860bbbc02ebcfdccdd84f6e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3be1542f9365be539de5fc7f8109f332c4bb1c1b10fa305960278b6ab2014df3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "60dd62bba4775f562b14f0998cfd105ba6317f72c13b2f309cbdd9fcb442ba03"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a97c1779cc4ae0bbcbecaf3df4b24ec52c176e5366b801669047b395206f5ee8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eaede5d634bafc9c29d331bbe9ac16ec33763dae345ebc8eb4a42c12b33d123b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "aa099c828f44bf0f8cfdf8f4dbcbf8b521110ab8db36e61a066ca7010cb2bfb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a58ac0e0b913c441fe3cb542bb367af85b94a6b9453b24ee50e8abb2093aed38"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c53ab19beb06d26473d492c04641315b32565f1159814f0a800bfe91b9773a4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "83c945fe1276e8ad5e7c7883c00c76cf2b2343cbe86c17b15f99590b94037935"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "c7eb2080349acc3860fb9d7702140c19333234c3987f8870bdbe1e53ae703342"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "27e7f095fea12aac97a82e7a74502eb788e7567531507c05f9887da060697d10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91f80741f4dcb2aedc6bc27ab7145818b998e81bffd12f8235043ea8159bf3f9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51f5fde6fdc919c0a38aae1f537bbe3ccef479c8d083fca265dccbb911b2d7c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"99204916b4ed12a8884b98570e06c9dba63d8d19bb15024d6ec7a74e421a04e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "041147f93c30dfe4146b1acbaa232c115d3a47639370d5610ec1d9ae74ae2c09"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1848b3a3bc8894b7145eae3caf9903226030ed8898f1d71a3d4c771a8a5fc4e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e454e547b899dedc442df595198f801d84b98f47cb1a3ff848cce24472429a10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e18d8f859b1b5f368bd09ed50c1e44bc4228b83e740d399e407b9fff88b6c0f0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e582677e6a79c0e57c3fbcd41321e7082ef5cc33f63e237719817c0d91624e5f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "019056de5fe0a1a9e8456e2b3e4c3807478216b232905a1a22c320c100b18be8"}, -#endif // EXCLUDE_SM_100 -}; +extern TllmGenFmhaKernelMetaInfo const sTllmGenFmhaKernelMetaInfos[]; +extern size_t const sTllmGenFmhaKernelMetaInfosSize; // clang-format on } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp new file mode 100644 index 0000000000..d6e0a71040 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:58892823173fd43ae549acccc4821c4eddc1605cce202489b0d1f425ebe279e3 +size 1573155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 624c7833b3..4eb5ac5266 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -17,6 +17,7 @@ #pragma once #include "cuda_runtime_api.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -34,8 +35,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -702,12 +703,13 @@ inline TllmGenFmhaKernel const* getTllmFmhaKernels( { #if !defined(EXCLUDE_SM_100) || !defined(EXCLUDE_SM_103) - return TllmFmhaKernelFactory::Get().getKernels(sTllmGenFmhaKernelMetaInfos, - sizeof(sTllmGenFmhaKernelMetaInfos) / sizeof(sTllmGenFmhaKernelMetaInfos[0]), dtypeQ, dtypeKv, dtypeOut, sm); + return TllmFmhaKernelFactory::Get().getKernels( + sTllmGenFmhaKernelMetaInfos, sTllmGenFmhaKernelMetaInfosSize, dtypeQ, dtypeKv, dtypeOut, sm); #else return nullptr; #endif // EXCLUDE_SM_100 } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu index 49f1cdbe88..1a0cca54da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu @@ -16,12 +16,13 @@ #include "fmhaReduction.h" #include "kernelUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -393,4 +394,5 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams 
//////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h index dd771f123e..c717e333c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h @@ -19,9 +19,10 @@ #include "cubin/kernelMetaInfo.h" #include "fmhaRunnerParams.h" #include "kernelParams.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -33,4 +34,5 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp index eca8d18d15..da476d1126 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp @@ -15,14 +15,15 @@ */ #include "fmhaRunner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -76,4 +77,5 @@ size_t TllmGenFmhaRunner::getTotalDeviceMemory() const } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h index 4d2c6f9cb6..b42a61a818 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h @@ -20,10 +20,11 @@ #include "fmhaKernels.h" #include "fmhaRunnerParams.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -59,4 +60,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h index 90907f1352..b43f70b713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -362,4 +363,5 @@ struct TllmGenSelectKernelParams }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h index 7961213f2b..fe33ac5890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h @@ -26,13 +26,14 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include "fmhaRunnerParams.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -854,4 +855,5 @@ struct KernelParams 
//////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h index 2d08684105..5f4e2f6b71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -170,4 +171,5 @@ inline __device__ void convertToFloatAndAccumulate<__nv_bfloat16, 8>( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu index bcae09dd36..af267c5901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu @@ -15,12 +15,13 @@ */ #include "prepareCustomMask.h" +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -292,4 +293,5 @@ void runPrepareCustomMask( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h index 178c104f65..86160a0aea 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h @@ -17,9 +17,10 @@ #pragma once #include "cubin/kernelMetaInfo.h" #include "fmhaRunnerParams.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -31,4 +32,5 @@ void runPrepareCustomMask( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index 726a2aea7e..cdac59877d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -24,11 +24,12 @@ #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -231,4 +232,5 @@ void TrtllmGenGemmRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h index 6bddd8cf3d..904cc8ed84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -57,4 +58,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace 
tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp index 25eb9cd915..b1bc466b47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp @@ -18,12 +18,13 @@ #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "trtllmGen_gatedAct_export/GemmGatedActInterface.h" #include "trtllmGen_gatedAct_export/GemmOptions.h" #include "trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using namespace gemmGatedAct::gemmGatedAct; @@ -144,4 +145,5 @@ void TrtllmGenGemmGatedActRunner::selectGemmConfig(int32_t m, int32_t n, int32_t } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h index cbd6bada46..7bbb5d9ad3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -56,4 +57,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu index f6107d3397..1db236fc47 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu @@ -16,6 +16,7 @@ */ #include 
"tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -27,8 +28,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2428,4 +2429,5 @@ INSTANTIATE_invokeCpTransposeToSeqMajor2(__nv_fp8_e4m3); #undef INSTANTIATE_invokeCpTransposeToSeqMajor2 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h index 57fd40b78c..1a8a7a7139 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/mlaKernels.h" @@ -25,8 +26,8 @@ #include #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -462,4 +463,4 @@ void invokeCpTransposeToSeqMajor2(T* dst, T const* src, int32_t const* q_seq_len } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu index 2dd6b9206b..5d006ef4a9 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, __nv_bfloat16, KVLi #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu index 7588cb6e13..2236e205a3 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -30,4 +31,5 @@ INSTANTIATE_ATTENTION_INPUT_PROCESSING(__nv_bfloat16, __nv_fp4_e2m1, KVLinearBuf #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu index a11c03d72f..9ae656040c 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, __nv_fp8_e4m3, KVLi #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu index b0aae2b69b..eeb063db5d 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, int8_t, KVLinearBuf #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu index 5ae9090c92..55e3e8756a 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, float, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, float, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu index 48db782612..ba27fff075 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, __nv_fp8_e4m3, KVBlockArray INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, __nv_fp8_e4m3, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu index 495db6c89a..ba25c39448 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, int8_t, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, int8_t, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu index a29bc7e451..ff3d2e87d9 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_PROCESSING(half, __nv_fp4_e2m1, KVLinearBuffer); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu index c0a1f384ed..55f51543c0 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, __nv_fp8_e4m3, KVBlockArray) INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, __nv_fp8_e4m3, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu index 5d886bd817..5abd544359 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, half, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, half, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu index ac9da4fa99..65f51b2f14 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, int8_t, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, int8_t, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 32facc70c5..053bf5114f 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -18,6 +18,7 @@ // Separate from unfusedAttentionKernel to accelerate compiling. #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -30,8 +31,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1865,4 +1866,5 @@ void invokeUpdateSparseKvCacheAfterFmha(QKVPreprocessingParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp index b588838c92..945e68a7ea 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp @@ -15,11 +15,13 @@ */ #include "ipcsocket.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include #include #include + #if 
ENABLE_MULTI_DEVICE namespace tensorrt_llm::runtime::ub { @@ -300,4 +302,5 @@ ipcSocketResult_t ipcSocketSendFd(IpcSocketHandle* handle, int const sendFd, int return ipcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); } } // namespace tensorrt_llm::runtime::ub + #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp index 2e3e6dde66..7fde40dbc7 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp @@ -26,7 +26,7 @@ UserBufferAllocator& UserBufferAllocator::Instance() return _; } -void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) +void UserBufferAllocator::initialize(::tensorrt_llm::runtime::WorldConfig const& worldConfig) { if (!isInitialized()) { diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h index 05a4b6dd4e..d9e3494a44 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h @@ -20,6 +20,7 @@ #include "nccl.h" #include "userbuffers.h" #else + using ncclWindow_t = void*; #endif @@ -56,7 +57,7 @@ public: UserBufferAllocator() = default; - virtual void initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig); + virtual void initialize(::tensorrt_llm::runtime::WorldConfig const& worldConfig); bool isInitialized(); UBBuffer allocate(size_t bytes); void deallocate(void* addr); @@ -70,7 +71,7 @@ private: protected: std::vector mBuffers; bool mIsInitialized; - tensorrt_llm::runtime::WorldConfig mWorldConfig; + ::tensorrt_llm::runtime::WorldConfig mWorldConfig; }; #else diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp index 6d5f62b260..3e19f9ebe7 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp +++ 
b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "ub_interface.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include #include @@ -21,7 +22,7 @@ #if ENABLE_MULTI_DEVICE namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config) +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config) { UserBufferAllocator::Instance().initialize(world_config); } @@ -30,7 +31,7 @@ void ub_initialize(int tp_size) { int num_devices; TLLM_CUDA_CHECK(cudaGetDeviceCount(&num_devices)); - tensorrt_llm::runtime::WorldConfig world_config(tp_size, 1, 1, COMM_SESSION.getRank(), num_devices); + ::tensorrt_llm::runtime::WorldConfig world_config(tp_size, 1, 1, COMM_SESSION.getRank(), num_devices); UserBufferAllocator::Instance().initialize(world_config); } @@ -71,10 +72,13 @@ bool ub_supported() } }; // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub -{ using namespace tensorrt_llm::runtime::ub; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub +{ + void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream) { @@ -115,11 +119,14 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, dataType, comm, stream); } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END + #else namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config) {} +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config) {} void ub_initialize(int tp_size) {} @@ -151,10 +158,12 @@ bool ub_supported() } }; // namespace tensorrt_llm::runtime::ub -namespace 
tensorrt_llm::kernels::ub -{ using namespace tensorrt_llm::runtime::ub; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub +{ void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream) { @@ -182,5 +191,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si { return 0; } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h index a33dd0ac58..e8a48e2c68 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h @@ -15,13 +15,14 @@ */ #pragma once #include "cuda_runtime.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/dataType.h" #include "ub_allocator.h" namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config); +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config); void ub_initialize(int tp_size); bool ub_is_initialized(); UBBuffer ub_allocate(size_t bytes); @@ -31,9 +32,13 @@ communicator* ub_comm(); bool ub_supported(); }; // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; + +using ::tensorrt_llm::runtime::ub::communicator; + void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream = 0); @@ -53,4 +58,6 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si int const out_handler, size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const 
hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp index daba59b35a..be4d5e0c2e 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp @@ -123,7 +123,7 @@ void ub_free(void* ptr) } } // namespace -int create_communicator_grouped2(communicator** comm, tensorrt_llm::runtime::WorldConfig const& world_config) +int create_communicator_grouped2(communicator** comm, ::tensorrt_llm::runtime::WorldConfig const& world_config) { *comm = (communicator*) malloc(sizeof(communicator)); diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu index 52956d9f9e..8cb5814e03 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu @@ -14,13 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/quantization.cuh" #include "userbuffers.h" #include "utils.h" -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; #define MAX_THREADS 1024 #define TIMEOUT 200000000000ull @@ -1953,4 +1955,6 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h index 9751f969d5..96f21b7482 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h @@ -97,7 +97,7 @@ struct communicator }; using communicator = struct communicator; -int create_communicator_grouped2(communicator** comm, tensorrt_llm::runtime::WorldConfig const& world_config); +int create_communicator_grouped2(communicator** comm, ::tensorrt_llm::runtime::WorldConfig const& world_config); /* creates communicator with allreduce1 to happen in datagpus x datanodes groups, allreduce2 to happen in tensorgpus x tensor nodes, @@ -114,9 +114,11 @@ int register_user_buffer_collective(void** gpubuff, size_t bytes, communicator* void destroy_communicator(communicator* comm); } // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; +using namespace ::tensorrt_llm::runtime::ub; void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream = 0); // for TP-parallelism, only single node is implemented @@ -137,4 +139,6 @@ int 
allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h index c8228f7d1c..c8c5f10f8a 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -102,4 +103,5 @@ struct Params }; } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h index 463f3f7fe2..0bb32bdca6 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -79,4 +80,5 @@ struct I2FConverter } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu index 94488579ec..c60d8f9d88 100644 --- 
a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu @@ -15,11 +15,12 @@ */ #include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm @@ -330,4 +331,5 @@ bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream) } // namespace cuda_core_gemm } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h index dd4a72d1b8..eb939b57c2 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" @@ -35,8 +36,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm @@ -95,4 +96,5 @@ struct Params bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream); } // namespace cuda_core_gemm } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu index 5752c79332..1d208a293b 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu @@ -15,12 +15,13 @@ */ #include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include 
"tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm_nvfp4 @@ -290,4 +291,5 @@ bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream) } // namespace cuda_core_gemm_nvfp4 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h index 2e37196d0d..d47d37c06a 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" @@ -35,8 +36,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm_nvfp4 @@ -78,4 +79,5 @@ struct Params bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream); } // namespace cuda_core_gemm_nvfp4 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h index 19dd66fa87..766d379112 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h @@ -15,10 +15,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -145,4 +146,5 @@ struct KernelDetails } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu index 8804da4e52..96aa3e0d91 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace smooth_quant @@ -172,4 +173,5 @@ template void int8_sq_launcher<__nv_bfloat16>(Params& params, cudaStream_t s); #endif } // namespace smooth_quant } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h index fa247e279a..d33e6a331d 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include #include @@ -25,8 +26,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace smooth_quant @@ -60,4 +61,5 @@ template void int8_sq_launcher(Params& params, cudaStream_t s); } // namespace smooth_quant } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h index de4a960e14..be95976465 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" -namespace 
tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -159,4 +160,5 @@ void exec_kernel(Params& params, cudaStream_t s) } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h index 8a44f8aeaf..05bdcfab6c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -171,4 +172,5 @@ void select_gs(Params& params, cudaStream_t s) KernelDetails>(Params & params, cudaStream_t s); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu index 75fe733145..1c1324d33f 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu index 02892bcf72..26d856258c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu index 42d984c49f..6af1e8dc96 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu index e1080ee620..9fd295a594 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu index 41f69e246c..9c97b82d57 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu index 6c718b24a9..adf02fcd45 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu index 44d6ebbaf3..31f7e4115c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu index 7cee8ee139..29725cfe9c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu index 555f2db582..1662999bc4 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu index e392da50da..371bcd73a3 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu index 6a77b98cf3..6bbec17ccc 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu index 08034547da..51ff227805 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu index 8a3d0ee94a..eb0d3fb7ce 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu index fa5002ae05..33225d078b 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu index f8eeb0dfd9..0b66b130bd 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu index 626e99bc50..d6932b9348 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index 0ca925d3a5..4562562754 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -15,12 +15,13 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -112,4 +113,5 @@ inline bool is_supported(int arch, KernelType kernel_type) } } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h index 4e660f0d60..2d5d2a2ee7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -330,4 +331,5 @@ private: }; } // namespace weight_only } // namespace kernels -} // 
namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 9bc7513aea..458b6983d8 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -15,6 +15,7 @@ */ #include "xqaDispatcher.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" @@ -38,7 +39,9 @@ constexpr inline T roundUp(T a, T b) } // namespace -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -538,4 +541,6 @@ void XqaDispatcher::run( //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.h b/cpp/tensorrt_llm/kernels/xqaDispatcher.h index 784b30eda8..8888beddb8 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.h +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; using tensorrt_llm::common::op::UniqPtrWNullCopy; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -114,4 +117,6 @@ constexpr uint32_t xqaMlaCgaXBufSize = 8704 * 2; //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp index 8688f8e79c..b6e42df465 100644 --- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp @@ -16,6 +16,7 @@ */ #include "bindings.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/ub_interface.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" #include "tensorrt_llm/nanobind/common/customCasters.h" @@ -24,7 +25,9 @@ namespace nb = nanobind; namespace tub = tensorrt_llm::runtime::ub; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { void UserBufferBindings::initBindings(nb::module_& m) @@ -49,4 +52,6 @@ void UserBufferBindings::initBindings(nb::module_& m) m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager, nb::call_guard()); } -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h index 15728bf6c1..6956aac5bd 100644 --- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h @@ -17,14 +17,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include + namespace nb = nanobind; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { class UserBufferBindings { public: static void initBindings(nb::module_& m); }; -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp index 58f4bfa85c..743df47309 100644 --- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp @@ -16,13 +16,16 @@ */ 
#include "bindings.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/ub_interface.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" namespace py = pybind11; namespace tub = tensorrt_llm::runtime::ub; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { void UserBufferBindings::initBindings(pybind11::module_& m) @@ -47,4 +50,6 @@ void UserBufferBindings::initBindings(pybind11::module_& m) m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager, py::call_guard()); } -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h index 3a8fba2cc6..1895dc7543 100644 --- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h +++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h @@ -17,14 +17,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/pybind/common/customCasters.h" #include -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { class UserBufferBindings { public: static void initBindings(pybind11::module_& m); }; -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp index 916062d3cd..3fcb38822a 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp @@ -127,7 +127,7 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step, auto batchSlotsRange = BufferRange(*dInput.batchSlots); for (auto batchSlot : batchSlotsRange) { - TensorPtr finishedStepsSlice = ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1); + ::TensorPtr finishedStepsSlice = 
ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1); bufferManager.setZero(*finishedStepsSlice); } } diff --git a/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp b/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp index b94674f1ca..940d59258c 100644 --- a/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp +++ b/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp @@ -23,6 +23,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -93,6 +95,8 @@ void indexer_k_cache_scatter_op(th::Tensor const& k_fp8_bytes, th::Tensor const& } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -102,5 +106,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_k_cache_scatter_op", &torch_ext::indexer_k_cache_scatter_op); + m.impl("indexer_k_cache_scatter_op", &tensorrt_llm::torch_ext::indexer_k_cache_scatter_op); } diff --git a/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp b/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp index 8a5003238c..d5a1917fbd 100644 --- a/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp +++ b/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp @@ -31,6 +31,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -118,8 +120,11 @@ void indexer_topk_prefill(th::Tensor const& logits, th::Tensor const& row_starts indices.data_ptr(), num_rows, num_columns, static_cast(logits_stride_0), static_cast(logits_stride_1), static_cast(index_topk), stream); } + } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -129,7 +134,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_topk_decode", &torch_ext::indexer_topk_decode); + m.impl("indexer_topk_decode", &tensorrt_llm::torch_ext::indexer_topk_decode); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -141,5 +146,5 @@ 
TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_topk_prefill", &torch_ext::indexer_topk_prefill); + m.impl("indexer_topk_prefill", &tensorrt_llm::torch_ext::indexer_topk_prefill); } diff --git a/cpp/tensorrt_llm/thop/allgatherOp.cpp b/cpp/tensorrt_llm/thop/allgatherOp.cpp index 0ce8d99e58..0d92aa9669 100644 --- a/cpp/tensorrt_llm/thop/allgatherOp.cpp +++ b/cpp/tensorrt_llm/thop/allgatherOp.cpp @@ -35,6 +35,8 @@ using tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -286,6 +288,8 @@ std::vector allgather_list_pg(torch::TensorList input_list, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("allgather(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); @@ -300,8 +304,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("allgather", &torch_ext::allgather); - m.impl("allgather_pg", &torch_ext::allgather_pg); - m.impl("allgather_list", &torch_ext::allgather_list); - m.impl("allgather_list_pg", &torch_ext::allgather_list_pg); + m.impl("allgather", &tensorrt_llm::torch_ext::allgather); + m.impl("allgather_pg", &tensorrt_llm::torch_ext::allgather_pg); + m.impl("allgather_list", &tensorrt_llm::torch_ext::allgather_list); + m.impl("allgather_list_pg", &tensorrt_llm::torch_ext::allgather_list_pg); } diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index fbd60d1ec5..c753242518 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -65,6 +65,8 @@ using tensorrt_llm::pg_utils::get_world_pg; using tensorrt_llm::pg_utils::get_local_pg; using tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -1528,6 +1530,8 @@ std::vector mnnvlFusionAllReduce(torch::Tensor& input, torch::opt } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -1591,11 
+1595,11 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mnnvl_fusion_allreduce", &torch_ext::mnnvlFusionAllReduce); - m.impl("allreduce", &torch_ext::allreduce_raw); - m.impl("allreduce_pg", &torch_ext::allreduce_pg); - m.impl("moe_allreduce", &torch_ext::moe_allreduce); - m.impl("moe_finalize_allreduce", &torch_ext::moe_finalize_allreduce); + m.impl("mnnvl_fusion_allreduce", &tensorrt_llm::torch_ext::mnnvlFusionAllReduce); + m.impl("allreduce", &tensorrt_llm::torch_ext::allreduce_raw); + m.impl("allreduce_pg", &tensorrt_llm::torch_ext::allreduce_pg); + m.impl("moe_allreduce", &tensorrt_llm::torch_ext::moe_allreduce); + m.impl("moe_finalize_allreduce", &tensorrt_llm::torch_ext::moe_finalize_allreduce); } TORCH_LIBRARY_IMPL(trtllm, CPU, m) diff --git a/cpp/tensorrt_llm/thop/alltoallOp.cpp b/cpp/tensorrt_llm/thop/alltoallOp.cpp index fdc691575b..61c09466db 100644 --- a/cpp/tensorrt_llm/thop/alltoallOp.cpp +++ b/cpp/tensorrt_llm/thop/alltoallOp.cpp @@ -30,6 +30,8 @@ #include #endif // ENABLE_MULTI_DEVICE +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -119,6 +121,8 @@ std::vector alltoall_helix( } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("alltoall_helix(Tensor[] input_list, int[] group, int? 
num_lists) -> Tensor[]"); @@ -126,5 +130,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("alltoall_helix", &torch_ext::alltoall_helix); + m.impl("alltoall_helix", &tensorrt_llm::torch_ext::alltoall_helix); } diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp index cbb498fcf8..1fb1ce1d62 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.cpp +++ b/cpp/tensorrt_llm/thop/attentionOp.cpp @@ -29,6 +29,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using tensorrt_llm::common::op::AttentionOp; @@ -964,7 +966,9 @@ bool attention_supports_nvfp4_output(int64_t const num_heads, int64_t const num_ } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.def("attention_supports_nvfp4_output", &torch_ext::attention_supports_nvfp4_output); + m.def("attention_supports_nvfp4_output", &tensorrt_llm::torch_ext::attention_supports_nvfp4_output); } diff --git a/cpp/tensorrt_llm/thop/attentionOp.h b/cpp/tensorrt_llm/thop/attentionOp.h index d15a33d528..712f7b9257 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.h +++ b/cpp/tensorrt_llm/thop/attentionOp.h @@ -19,6 +19,10 @@ #include #include +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -69,3 +73,5 @@ void attention(torch::Tensor q, std::optional k, std::optional mla_bmm2_scale, std::optional quant_q_buffer); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/causalConv1dOp.cpp b/cpp/tensorrt_llm/thop/causalConv1dOp.cpp index 9201cdb7e3..0d4a13672b 100644 --- a/cpp/tensorrt_llm/thop/causalConv1dOp.cpp +++ b/cpp/tensorrt_llm/thop/causalConv1dOp.cpp @@ -24,6 +24,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -289,6 +291,8 @@ void causalConv1dUpdate(at::Tensor const& x, at::Tensor const& conv_state, at::T } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -315,6 
+319,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("causal_conv1d_fwd", &torch_ext::causalConv1dFwd); - m.impl("causal_conv1d_update", &torch_ext::causalConv1dUpdate); + m.impl("causal_conv1d_fwd", &tensorrt_llm::torch_ext::causalConv1dFwd); + m.impl("causal_conv1d_update", &tensorrt_llm::torch_ext::causalConv1dUpdate); } diff --git a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp index 5cbd2ba0de..a3ddc746e4 100644 --- a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp +++ b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp @@ -19,6 +19,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLengthsTensor, @@ -81,5 +83,8 @@ void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLen } // namespace torch_ext -static auto convert_spec_decoding_mask_to_packed_mask = torch::RegisterOperators( - "tensorrt_llm::convert_spec_decoding_mask_to_packed_mask", &torch_ext::convertSpecDecodingMaskToPackedMask); +TRTLLM_NAMESPACE_END + +static auto convert_spec_decoding_mask_to_packed_mask + = torch::RegisterOperators("tensorrt_llm::convert_spec_decoding_mask_to_packed_mask", + &tensorrt_llm::torch_ext::convertSpecDecodingMaskToPackedMask); diff --git a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp index 02eae46d74..77ad23c0ab 100644 --- a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp @@ -27,6 +27,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -427,10 +429,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("CublasLtFP4GemmRunner") + m.class_("CublasLtFP4GemmRunner") .def(torch::init()) - .def("run_gemm", 
&torch_ext::CublasLtFP4GemmRunner::runGemm) - .def("get_num_heuristic_algos", &torch_ext::CublasLtFP4GemmRunner::getNumHeuristicAlgos); + .def("run_gemm", &tensorrt_llm::torch_ext::CublasLtFP4GemmRunner::runGemm) + .def("get_num_heuristic_algos", &tensorrt_llm::torch_ext::CublasLtFP4GemmRunner::getNumHeuristicAlgos); } diff --git a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp index 8baeba022b..ddf8024b91 100644 --- a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp @@ -29,6 +29,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -377,6 +379,8 @@ Tensor cublas_mm(Tensor const& mat_a, Tensor const& mat_b, std::optional #include +#include "tensorrt_llm/common/config.h" + namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { th::Tensor& cublas_mm_out( @@ -34,3 +38,5 @@ th::Tensor cublas_scaled_mm(th::Tensor const& mat_a, th::Tensor const& mat_b, th th::Tensor cublas_scaled_mm_out(th::Tensor const& mat_a, th::Tensor const& mat_b, th::Tensor const& scale_a, th::Tensor const& scale_b, std::optional const& bias, th::Tensor& out); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp b/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp index 8a8ddb32e2..bcd9d9d62e 100644 --- a/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp +++ b/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp @@ -24,6 +24,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -131,6 +133,8 @@ Tensor cuda_core_nvfp4_gemm(Tensor const& mat_a, Tensor const& mat_b, Tensor con } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -140,5 +144,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cuda_core_nvfp4_gemm", &torch_ext::cuda_core_nvfp4_gemm); + m.impl("cuda_core_nvfp4_gemm", &tensorrt_llm::torch_ext::cuda_core_nvfp4_gemm); } diff --git a/cpp/tensorrt_llm/thop/cudaScaledMM.cpp 
b/cpp/tensorrt_llm/thop/cudaScaledMM.cpp index 60a7358f5a..db4713f60e 100644 --- a/cpp/tensorrt_llm/thop/cudaScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cudaScaledMM.cpp @@ -24,6 +24,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -121,6 +123,8 @@ Tensor cuda_scaled_mm(Tensor const& mat_a, Tensor const& mat_b, Tensor const& sc } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -130,5 +134,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cuda_scaled_mm", &torch_ext::cuda_scaled_mm); + m.impl("cuda_scaled_mm", &tensorrt_llm::torch_ext::cuda_scaled_mm); } diff --git a/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp index 81fb4acf9c..e7f0164ab3 100644 --- a/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp +++ b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp @@ -22,6 +22,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { template @@ -121,6 +123,8 @@ std::tuple default_moe_routing_op( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -130,7 +134,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("renorm_moe_routing_op", &torch_ext::renorm_moe_routing_op); + m.impl("renorm_moe_routing_op", &tensorrt_llm::torch_ext::renorm_moe_routing_op); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -142,5 +146,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("default_moe_routing_op", &torch_ext::default_moe_routing_op); + m.impl("default_moe_routing_op", &tensorrt_llm::torch_ext::default_moe_routing_op); } diff --git a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp index 54c45031a1..770c1459f9 100644 --- a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp +++ b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp @@ 
-20,6 +20,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // Sort @@ -473,6 +475,8 @@ torch::Tensor moe_gelu(torch::Tensor const& input, torch::Tensor const& tile_idx } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -503,12 +507,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_topk_sort", &torch_ext::moe_topk_sort); - m.impl("moe_sort", &torch_ext::moe_sort); - m.impl("moe_permute", &torch_ext::moe_permute); - m.impl("moe_unpermute", &torch_ext::moe_unpermute); - m.impl("moe_output_memset_inplace", &torch_ext::moe_output_memset_inplace); - m.impl("moe_swiglu", &torch_ext::moe_swiglu); - m.impl("moe_swiglu_nvfp4_quantize", &torch_ext::moe_swiglu_nvfp4_quantize); - m.impl("moe_gelu", &torch_ext::moe_gelu); + m.impl("moe_topk_sort", &tensorrt_llm::torch_ext::moe_topk_sort); + m.impl("moe_sort", &tensorrt_llm::torch_ext::moe_sort); + m.impl("moe_permute", &tensorrt_llm::torch_ext::moe_permute); + m.impl("moe_unpermute", &tensorrt_llm::torch_ext::moe_unpermute); + m.impl("moe_output_memset_inplace", &tensorrt_llm::torch_ext::moe_output_memset_inplace); + m.impl("moe_swiglu", &tensorrt_llm::torch_ext::moe_swiglu); + m.impl("moe_swiglu_nvfp4_quantize", &tensorrt_llm::torch_ext::moe_swiglu_nvfp4_quantize); + m.impl("moe_gelu", &tensorrt_llm::torch_ext::moe_gelu); } diff --git a/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp b/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp index c9b05bb3d5..b314cb4d16 100644 --- a/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp @@ -35,6 +35,8 @@ using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassLowLatencyFp8GemmR using tensorrt_llm::kernels::internal_cutlass_kernels::LowLatencyCutlassGemmConfig; using tensorrt_llm::kernels::internal_cutlass_kernels::KernelScheduleType; #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -171,6 +173,8 @@ Tensor cutlass_scaled_mm(Tensor const& mat_a, Tensor 
const& mat_b, Tensor const& } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -180,5 +184,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cutlass_scaled_mm", &torch_ext::cutlass_scaled_mm); + m.impl("cutlass_scaled_mm", &tensorrt_llm::torch_ext::cutlass_scaled_mm); } diff --git a/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp b/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp index 9d8bb5de35..c16f16a680 100644 --- a/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { th::Tensor dsv3_fused_a_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b, std::optional const& bias, @@ -85,6 +87,8 @@ th::Tensor dsv3_fused_a_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("dsv3_fused_a_gemm_op(Tensor mat_a, Tensor mat_b, Tensor? bias, ScalarType? 
out_dtype) -> (Tensor out)"); @@ -92,5 +96,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("dsv3_fused_a_gemm_op", &torch_ext::dsv3_fused_a_gemm_op); + m.impl("dsv3_fused_a_gemm_op", &tensorrt_llm::torch_ext::dsv3_fused_a_gemm_op); } diff --git a/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp b/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp index 39657c71e7..ff28f2004f 100644 --- a/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp @@ -38,6 +38,8 @@ namespace tk = tensorrt_llm::kernels; namespace tc = tensorrt_llm::common; namespace tr = tensorrt_llm::runtime; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -308,6 +310,8 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim + } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -356,5 +360,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mla_rope_generation", &torch_ext::MLARopeGeneration); + m.impl("mla_rope_generation", &tensorrt_llm::torch_ext::MLARopeGeneration); } diff --git a/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp b/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp index 89ead8cade..6764cbef64 100644 --- a/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -106,6 +108,8 @@ th::Tensor dsv3_router_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b, } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("dsv3_router_gemm_op(Tensor mat_a, Tensor mat_b, Tensor? bias, ScalarType? 
out_dtype) -> (Tensor out)"); @@ -113,5 +117,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("dsv3_router_gemm_op", &torch_ext::dsv3_router_gemm_op); + m.impl("dsv3_router_gemm_op", &tensorrt_llm::torch_ext::dsv3_router_gemm_op); } diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index f9e0e76a46..8e9e817bbb 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -33,6 +33,8 @@ namespace tr = tensorrt_llm::runtime; namespace tl = tensorrt_llm::layers; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -452,8 +454,10 @@ th::Tensor DynamicDecodeOp::forward( } // namespace torch_ext +TRTLLM_NAMESPACE_END + static auto trtllmGptContextDecoderTHS - = torch::jit::class_("trtllm", "DynamicDecodeOp") + = torch::jit::class_("trtllm", "DynamicDecodeOp") .def(torch::jit::init()) - .def("setup", &torch_ext::DynamicDecodeOp::setup) - .def("forward", &torch_ext::DynamicDecodeOp::forward); + .def("setup", &tensorrt_llm::torch_ext::DynamicDecodeOp::setup) + .def("forward", &tensorrt_llm::torch_ext::DynamicDecodeOp::forward); diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index 533066cc2a..c8f4fa807d 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/layers/dynamicDecodeLayer.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -21,6 +22,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -158,3 +161,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp index f2255604e2..6d47a76021 100644 --- a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp +++ b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp @@ -41,6 +41,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -271,10 +273,12 @@ int64_t finegrainedMixedDtypeGemmRunner::getNumConfigs() const } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("finegrainedMixedDtypeGemmRunner") + m.class_("finegrainedMixedDtypeGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::finegrainedMixedDtypeGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::finegrainedMixedDtypeGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::finegrainedMixedDtypeGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::finegrainedMixedDtypeGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h index 5bda7be3eb..e8a11d2bdc 100644 --- a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h +++ b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h @@ -18,9 +18,12 @@ #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -44,3 +47,5 @@ private: }; } // 
namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp b/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp index 57d8f6609c..5fa8d8637e 100644 --- a/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp +++ b/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp @@ -14,10 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/mathUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h" #include "tensorrt_llm/thop/thUtils.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using torch::Tensor; @@ -177,12 +180,14 @@ Tensor pack_fmha_mask_by_input( } // namespace torch_ext +TRTLLM_NAMESPACE_END + //////////////////////////////////////////////////////////////////////////////////////////////////// // Utility methods. -static auto pack_fmha_mask_by_type - = torch::RegisterOperators("tensorrt_llm::pack_fmha_mask_by_type", &torch_ext::pack_fmha_mask_by_type); +static auto pack_fmha_mask_by_type = torch::RegisterOperators( + "tensorrt_llm::pack_fmha_mask_by_type", &tensorrt_llm::torch_ext::pack_fmha_mask_by_type); // Utility methods. 
-static auto pack_fmha_mask_by_input - = torch::RegisterOperators("tensorrt_llm::pack_fmha_mask_by_input", &torch_ext::pack_fmha_mask_by_input); +static auto pack_fmha_mask_by_input = torch::RegisterOperators( + "tensorrt_llm::pack_fmha_mask_by_input", &tensorrt_llm::torch_ext::pack_fmha_mask_by_input); diff --git a/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp b/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp index 01368ee384..9ecda1a884 100644 --- a/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp +++ b/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp @@ -24,6 +24,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [B, M, K], fp16/bf16/fp8_quantized @@ -99,6 +101,8 @@ std::tuple fp4_batched_quantize( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -108,5 +112,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_batched_quantize", &torch_ext::fp4_batched_quantize); + m.impl("fp4_batched_quantize", &tensorrt_llm::torch_ext::fp4_batched_quantize); } diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp index 700c1a7d5a..81746654a4 100644 --- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp @@ -22,6 +22,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace btg = batchedGemm::trtllm::gen; @@ -576,17 +578,20 @@ torch::Tensor shuffleMatrix(torch::Tensor matrix, torch::Tensor permuteIndices) } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP4BlockScaleMoERunner") + m.class_("FP4BlockScaleMoERunner") .def(torch::init<>()) - .def("get_valid_configs", &torch_ext::FP4BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP4BlockScaleMoeRunner::run); - m.class_("FP8FP4BlockScaleMoERunner") + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP4BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", 
&tensorrt_llm::torch_ext::FP4BlockScaleMoeRunner::run); + m.class_("FP8FP4BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::FP8FP4BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP8FP4BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8FP4BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::FP8FP4BlockScaleMoeRunner::run); } // Accepts both CPU and CUDA tensors -static auto shuffle_matrix = torch::RegisterOperators("trtllm::shuffle_matrix", &torch_ext::shuffleMatrix); +static auto shuffle_matrix + = torch::RegisterOperators("trtllm::shuffle_matrix", &tensorrt_llm::torch_ext::shuffleMatrix); diff --git a/cpp/tensorrt_llm/thop/fp4Gemm.cpp b/cpp/tensorrt_llm/thop/fp4Gemm.cpp index 2fa818bdee..9c33436dc0 100644 --- a/cpp/tensorrt_llm/thop/fp4Gemm.cpp +++ b/cpp/tensorrt_llm/thop/fp4Gemm.cpp @@ -47,6 +47,8 @@ using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassFp4GemmRunner; using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassFp4GemmRunnerInterface; #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -310,12 +312,14 @@ private: }; } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP4GemmRunner") + m.class_("FP4GemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::FP4GemmRunner::runGemm) - .def("get_num_configs", &torch_ext::FP4GemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::FP4GemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::FP4GemmRunner::getNumConfigs); m.def( "fp4_bmm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale, Tensor globalScale, int fp4GemmType, " @@ -327,6 +331,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_bmm", &torch_ext::fp4_bmm); - m.impl("fp4_gemm", &torch_ext::fp4_bmm); + m.impl("fp4_bmm", &tensorrt_llm::torch_ext::fp4_bmm); + m.impl("fp4_gemm", 
&tensorrt_llm::torch_ext::fp4_bmm); } diff --git a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp index 6b923336d1..1c9ac017fb 100644 --- a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -126,6 +128,8 @@ at::Tensor fp4_gemm_trtllmgen(at::Tensor const& mat1, at::Tensor const& mat2, at } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -136,5 +140,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_gemm_trtllmgen", &torch_ext::fp4_gemm_trtllmgen); + m.impl("fp4_gemm_trtllmgen", &tensorrt_llm::torch_ext::fp4_gemm_trtllmgen); } diff --git a/cpp/tensorrt_llm/thop/fp4Op.cpp b/cpp/tensorrt_llm/thop/fp4Op.cpp index 54746be1c7..abaf242858 100644 --- a/cpp/tensorrt_llm/thop/fp4Op.cpp +++ b/cpp/tensorrt_llm/thop/fp4Op.cpp @@ -27,6 +27,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -476,17 +478,19 @@ th::Tensor E2M1AndUFP8SFScaleToFloatV2(th::Tensor valueE2M1, th::Tensor scaleFP8 } // namespace torch_ext -static auto float_to_e2m1_and_ufp8sf_scale - = torch::RegisterOperators("tensorrt_llm::float_to_e2m1_and_ufp8sf_scale", &torch_ext::FloatToE2M1AndUFP8SFScale); +TRTLLM_NAMESPACE_END -static auto half_to_e2m1_and_ufp8sf_scale - = torch::RegisterOperators("tensorrt_llm::half_to_e2m1_and_ufp8sf_scale", &torch_ext::HalfToE2M1AndUFP8SFScale); +static auto float_to_e2m1_and_ufp8sf_scale = torch::RegisterOperators( + "tensorrt_llm::float_to_e2m1_and_ufp8sf_scale", &tensorrt_llm::torch_ext::FloatToE2M1AndUFP8SFScale); -static auto e2m1_and_ufp8sf_scale_to_float - = torch::RegisterOperators("tensorrt_llm::e2m1_and_ufp8sf_scale_to_float", &torch_ext::E2M1AndUFP8SFScaleToFloat); +static auto half_to_e2m1_and_ufp8sf_scale = torch::RegisterOperators( + "tensorrt_llm::half_to_e2m1_and_ufp8sf_scale", 
&tensorrt_llm::torch_ext::HalfToE2M1AndUFP8SFScale); + +static auto e2m1_and_ufp8sf_scale_to_float = torch::RegisterOperators( + "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float", &tensorrt_llm::torch_ext::E2M1AndUFP8SFScaleToFloat); static auto e2m1_and_ufp8sf_scale_to_float_v2 = torch::RegisterOperators( - "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float_v2", &torch_ext::E2M1AndUFP8SFScaleToFloatV2); + "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float_v2", &tensorrt_llm::torch_ext::E2M1AndUFP8SFScaleToFloatV2); TORCH_LIBRARY_FRAGMENT(trtllm, m) { @@ -496,12 +500,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("block_scale_interleave", &torch_ext::BlockScaleInterleave); - m.impl("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse); + m.impl("block_scale_interleave", &tensorrt_llm::torch_ext::BlockScaleInterleave); + m.impl("block_scale_interleave_reverse", &tensorrt_llm::torch_ext::BlockScaleInterleaveReverse); } TORCH_LIBRARY_IMPL(trtllm, CPU, m) { - m.impl("block_scale_interleave", &torch_ext::BlockScaleInterleave); - m.impl("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse); + m.impl("block_scale_interleave", &tensorrt_llm::torch_ext::BlockScaleInterleave); + m.impl("block_scale_interleave_reverse", &tensorrt_llm::torch_ext::BlockScaleInterleaveReverse); } diff --git a/cpp/tensorrt_llm/thop/fp4Quantize.cpp b/cpp/tensorrt_llm/thop/fp4Quantize.cpp index a4d9b038bf..61745850c8 100644 --- a/cpp/tensorrt_llm/thop/fp4Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp4Quantize.cpp @@ -26,6 +26,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [M, K], fp16/bf16/fp8_quantized @@ -232,6 +234,8 @@ at::Tensor calculate_nvfp4_global_scale(at::Tensor const& input, std::optional #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { std::tuple fp4_quantize(at::Tensor const& self, std::optional const& globalScale, @@ -29,3 +33,5 @@ std::tuple fp4_quantize(at::Tensor const& self, 
std::opt at::Tensor calculate_nvfp4_global_scale(at::Tensor const& input, std::optional const& tokensPerBatch); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp index 8ed81c4aa9..b657b92eb3 100644 --- a/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -113,6 +115,8 @@ at::Tensor fp4_fp8_gemm_trtllmgen(at::Tensor const& mat1, at::Tensor const& mat2 } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -122,5 +126,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_fp8_gemm_trtllmgen", &torch_ext::fp4_fp8_gemm_trtllmgen); + m.impl("fp4_fp8_gemm_trtllmgen", &tensorrt_llm::torch_ext::fp4_fp8_gemm_trtllmgen); } diff --git a/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp index be1970e480..f3da650a94 100644 --- a/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp @@ -173,6 +173,8 @@ std::tuple fp8_batched_gemm_sm100(at::Tensor const& mat1 } } // namespace +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -268,10 +270,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8BatchedGemmRunner") + m.class_("FP8BatchedGemmRunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::FP8BatchedGemmRunner::getValidConfigs) - .def("run_batched_gemm", &torch_ext::FP8BatchedGemmRunner::runBatchedGemm); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8BatchedGemmRunner::getValidConfigs) + .def("run_batched_gemm", &tensorrt_llm::torch_ext::FP8BatchedGemmRunner::runBatchedGemm); } diff --git a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp index 
42e55dc00c..b8e688d1d3 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp @@ -26,6 +26,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -395,10 +397,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8BlockScaleMoERunner") + m.class_("FP8BlockScaleMoERunner") .def(torch::init<>()) - .def("get_valid_configs", &torch_ext::FP8BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP8BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::FP8BlockScaleMoeRunner::run); } diff --git a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp index cdea9d03fa..d6e65a2941 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" #include "tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h" @@ -26,6 +27,8 @@ using namespace tensorrt_llm::kernels::fp8_blockscale_gemm; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -382,6 +385,8 @@ torch::Tensor fp8_block_scaling_bmm(torch::Tensor const& mat1, torch::Tensor con } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("fp8_block_scaling_gemm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor"); @@ -398,8 +403,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_block_scaling_gemm", &torch_ext::fp8_block_scaling_gemm); - m.impl("fp8_block_scaling_bmm", &torch_ext::fp8_block_scaling_bmm); - m.impl("fp8_block_scaling_bmm_out", &torch_ext::fp8_block_scaling_bmm_out); - m.impl("fp8_block_scaling_moe_gemm", &torch_ext::fp8_block_scaling_moe_gemm); + m.impl("fp8_block_scaling_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_gemm); + m.impl("fp8_block_scaling_bmm", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm); + m.impl("fp8_block_scaling_bmm_out", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm_out); + m.impl("fp8_block_scaling_moe_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_moe_gemm); } diff --git a/cpp/tensorrt_llm/thop/fp8Op.cpp b/cpp/tensorrt_llm/thop/fp8Op.cpp index 21f56757c6..867fd3de0c 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.cpp +++ b/cpp/tensorrt_llm/thop/fp8Op.cpp @@ -16,6 +16,7 @@ #include "tensorrt_llm/thop/fp8Op.h" #include "cutlass/numeric_types.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/thop/thUtils.h" @@ -26,6 +27,8 @@ #define TORCH_IS_AT_LEAST_v190 #endif +TRTLLM_NAMESPACE_BEGIN + namespace 
torch_ext { using torch::Tensor; @@ -370,6 +373,8 @@ Tensor symmetric_dequantize_per_tensor(Tensor input, Tensor scales) } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Utility methods that may be useful for preprocessing weights in torch. TORCH_LIBRARY_FRAGMENT(tensorrt_llm, m) { @@ -386,19 +391,19 @@ TORCH_LIBRARY_FRAGMENT(tensorrt_llm, m) TORCH_LIBRARY_IMPL(tensorrt_llm, CUDA, m) { - m.impl("quantize_e4m3_weight", &torch_ext::symmetric_quantize_weight); - m.impl("quantize_e4m3_activation", &torch_ext::symmetric_quantize_activation); - m.impl("quantize_e4m3_per_tensor", &torch_ext::symmetric_quantize_per_tensor); - m.impl("static_quantize_e4m3_weight", &torch_ext::symmetric_static_quantize_weight); - m.impl("static_quantize_e4m3_activation", &torch_ext::symmetric_static_quantize_activation); - m.impl("static_quantize_e4m3_per_tensor", &torch_ext::symmetric_static_quantize_per_tensor); - m.impl("dequantize_e4m3_weight", &torch_ext::symmetric_dequantize_weight); - m.impl("dequantize_e4m3_activation", &torch_ext::symmetric_dequantize_activation); - m.impl("dequantize_e4m3_per_tensor", &torch_ext::symmetric_dequantize_per_tensor); + m.impl("quantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_quantize_weight); + m.impl("quantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_quantize_activation); + m.impl("quantize_e4m3_per_tensor", &tensorrt_llm::torch_ext::symmetric_quantize_per_tensor); + m.impl("static_quantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_static_quantize_weight); + m.impl("static_quantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_static_quantize_activation); + m.impl("static_quantize_e4m3_per_tensor", &tensorrt_llm::torch_ext::symmetric_static_quantize_per_tensor); + m.impl("dequantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_dequantize_weight); + m.impl("dequantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_dequantize_activation); + m.impl("dequantize_e4m3_per_tensor", 
&tensorrt_llm::torch_ext::symmetric_dequantize_per_tensor); } -static auto dequantize_mxe4m3_host - = torch::RegisterOperators("tensorrt_llm::dequantize_mxe4m3_host", &torch_ext::dequantize_mxe4m3_host); +static auto dequantize_mxe4m3_host = torch::RegisterOperators( + "tensorrt_llm::dequantize_mxe4m3_host", &tensorrt_llm::torch_ext::dequantize_mxe4m3_host); static auto quantize_mxe4m3_host - = torch::RegisterOperators("tensorrt_llm::quantize_mxe4m3_host", &torch_ext::quantize_mxe4m3_host); + = torch::RegisterOperators("tensorrt_llm::quantize_mxe4m3_host", &tensorrt_llm::torch_ext::quantize_mxe4m3_host); diff --git a/cpp/tensorrt_llm/thop/fp8Op.h b/cpp/tensorrt_llm/thop/fp8Op.h index 1b08935d1d..1a9955c4d5 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.h +++ b/cpp/tensorrt_llm/thop/fp8Op.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/thop/thUtils.h" @@ -26,6 +27,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // Given the rowIdx and colIdx in the unswizzled SFMatrix, compute the 1D offset in the swizzled SFMatrix. 
@@ -83,3 +86,5 @@ torch::Tensor symmetric_dequantize_activation(torch::Tensor activation, torch::T torch::Tensor symmetric_dequantize_per_tensor(torch::Tensor input, torch::Tensor scales); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp index a1794d6c2f..9681be6e7a 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp @@ -19,6 +19,8 @@ #include "tensorrt_llm/thop/thUtils.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -310,6 +312,8 @@ torch::Tensor fp8_per_tensor_scale_moe_runner(torch::optional con } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -339,5 +343,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_per_tensor_scale_moe_runner", &torch_ext::fp8_per_tensor_scale_moe_runner); + m.impl("fp8_per_tensor_scale_moe_runner", &tensorrt_llm::torch_ext::fp8_per_tensor_scale_moe_runner); } diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp index 5c66eaf4f6..7f044a198e 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -161,6 +163,8 @@ torch::Tensor fp8_per_tensor_scaling_tllmg_gemm(torch::Tensor const& mat1, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -170,5 +174,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_per_tensor_scaling_tllmg_gemm", &torch_ext::fp8_per_tensor_scaling_tllmg_gemm); + m.impl("fp8_per_tensor_scaling_tllmg_gemm", &tensorrt_llm::torch_ext::fp8_per_tensor_scaling_tllmg_gemm); } diff --git a/cpp/tensorrt_llm/thop/fp8Quantize.cpp 
b/cpp/tensorrt_llm/thop/fp8Quantize.cpp index 7b0f86c47b..91746a321b 100644 --- a/cpp/tensorrt_llm/thop/fp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp8Quantize.cpp @@ -20,6 +20,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -135,6 +137,8 @@ std::tuple fp8_batched_quantize_1x128_permute102(at::Ten } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("fp8_quantize_1x128(Tensor input, bool use_ue8m0=False) -> (Tensor, Tensor)"); @@ -143,6 +147,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_quantize_1x128", &torch_ext::fp8_quantize_1x128); - m.impl("fp8_batched_quantize_1x128_permute102", &torch_ext::fp8_batched_quantize_1x128_permute102); + m.impl("fp8_quantize_1x128", &tensorrt_llm::torch_ext::fp8_quantize_1x128); + m.impl("fp8_batched_quantize_1x128_permute102", &tensorrt_llm::torch_ext::fp8_batched_quantize_1x128_permute102); } diff --git a/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp b/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp index 97a05a568c..a90795badf 100644 --- a/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp @@ -34,6 +34,8 @@ using tensorrt_llm::kernels::cutlass_kernels::CutlassFp8RowwiseGemmRunner; using tensorrt_llm::kernels::cutlass_kernels::CutlassFp8RowwiseGemmRunnerInterface; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -183,10 +185,12 @@ private: }; } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8RowwiseGemmRunner") + m.class_("FP8RowwiseGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::FP8RowwiseGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::FP8RowwiseGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::FP8RowwiseGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::FP8RowwiseGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp 
b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp index 20225ab71c..14bf8578dc 100644 --- a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp +++ b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp @@ -20,6 +20,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -94,3 +96,5 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp b/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp index 6b6e0edc7c..0974b30f43 100644 --- a/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp +++ b/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -56,6 +58,8 @@ std::tuple fused_topk_softmax(torch::Tensor const& } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -66,5 +70,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fused_topk_softmax", &torch_ext::fused_topk_softmax); + m.impl("fused_topk_softmax", &tensorrt_llm::torch_ext::fused_topk_softmax); } diff --git a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp index e951830768..45f2649a6a 100644 --- a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp +++ b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -135,4 +137,6 @@ th::Tensor gatherTree( // BS: batch_size, BM: } // namespace torch_ext -static auto gather_tree = torch::RegisterOperators("tensorrt_llm::gather_tree", &torch_ext::gatherTree); +TRTLLM_NAMESPACE_END + +static auto gather_tree = torch::RegisterOperators("tensorrt_llm::gather_tree", &tensorrt_llm::torch_ext::gatherTree); diff --git a/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp b/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp index 4cdffe6363..c408a8c286 100644 --- a/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp +++ 
b/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp @@ -28,6 +28,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -280,10 +282,12 @@ void groupRMSNormHeuristic(torch::TensorList const& inputs, torch::TensorList co } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("group_rms_norm_base", &torch_ext::groupRMSNormBase); - m.impl("group_rms_norm_large_batch", &torch_ext::groupRMSNormLargeBatch); + m.impl("group_rms_norm_base", &tensorrt_llm::torch_ext::groupRMSNormBase); + m.impl("group_rms_norm_large_batch", &tensorrt_llm::torch_ext::groupRMSNormLargeBatch); // Use groupRMSNormHeuristic which automatically selects between regular and large batch kernels - m.impl("group_rms_norm_heuristic", &torch_ext::groupRMSNormHeuristic); + m.impl("group_rms_norm_heuristic", &tensorrt_llm::torch_ext::groupRMSNormHeuristic); } diff --git a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp index 90a70c5edf..f8425cbade 100644 --- a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp +++ b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp @@ -21,6 +21,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -108,3 +110,5 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/llama4MinLatency.cpp b/cpp/tensorrt_llm/thop/llama4MinLatency.cpp index 53873e3d27..6737ca0dfd 100644 --- a/cpp/tensorrt_llm/thop/llama4MinLatency.cpp +++ b/cpp/tensorrt_llm/thop/llama4MinLatency.cpp @@ -33,6 +33,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -210,10 +212,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("llama4_bf16_bf16_gemm", &torch_ext::llama4_bf16_bf16_gemm); - m.impl("llama4_fp8_bf16_gemm", &torch_ext::llama4_fp8_bf16_gemm); - m.impl("llama4_fp8_fp8_gemm_swiglu", &torch_ext::llama4_fp8_fp8_gemm_swiglu); - m.impl("llama4_moe_tp8ep1_min_latency", 
&torch_ext::llama4_moe_tp8ep1_min_latency); + m.impl("llama4_bf16_bf16_gemm", &tensorrt_llm::torch_ext::llama4_bf16_bf16_gemm); + m.impl("llama4_fp8_bf16_gemm", &tensorrt_llm::torch_ext::llama4_fp8_bf16_gemm); + m.impl("llama4_fp8_fp8_gemm_swiglu", &tensorrt_llm::torch_ext::llama4_fp8_fp8_gemm_swiglu); + m.impl("llama4_moe_tp8ep1_min_latency", &tensorrt_llm::torch_ext::llama4_moe_tp8ep1_min_latency); } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp index 0a3fa76ff6..2f6eddd5ca 100644 --- a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp +++ b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp @@ -18,6 +18,8 @@ #include "tensorrt_llm/kernels/logitsBitmask.h" #include "tensorrt_llm/thop/thUtils.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -95,6 +97,8 @@ void logitsBitmask(torch::Tensor const& logits, torch::Tensor const& bitmask, } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("logits_bitmask(Tensor(a!) logits, Tensor bitmask, Tensor? token_mask=None, Tensor? 
d2t=None) -> ()"); @@ -102,5 +106,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("logits_bitmask", &torch_ext::logitsBitmask); + m.impl("logits_bitmask", &tensorrt_llm::torch_ext::logitsBitmask); } diff --git a/cpp/tensorrt_llm/thop/loraOp.cpp b/cpp/tensorrt_llm/thop/loraOp.cpp index 379e7cf43c..08cf10decf 100644 --- a/cpp/tensorrt_llm/thop/loraOp.cpp +++ b/cpp/tensorrt_llm/thop/loraOp.cpp @@ -26,6 +26,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; using tensorrt_llm::common::fmtstr; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -174,6 +176,8 @@ std::vector lora_grouped_gemm(th::Tensor const& input, th::Tensor co } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -192,5 +196,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("lora_grouped_gemm", &torch_ext::lora_grouped_gemm); + m.impl("lora_grouped_gemm", &tensorrt_llm::torch_ext::lora_grouped_gemm); } diff --git a/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp b/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp index f1933ae3cd..81f5a9ac8b 100644 --- a/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp +++ b/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -173,6 +175,8 @@ std::tuple mamba_conv1d(th::Tensor const& input, th::Ten } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -187,5 +191,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mamba_conv1d", &torch_ext::mamba_conv1d); + m.impl("mamba_conv1d", &tensorrt_llm::torch_ext::mamba_conv1d); } diff --git a/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp b/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp index 6dfffec54d..171f0d1522 100644 --- a/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp +++ b/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp @@ -28,6 +28,8 @@ namespace tk 
= tensorrt_llm::kernels; namespace tc = tensorrt_llm::common; using tk::KVBlockArray; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -468,6 +470,8 @@ void mergeChunkedAttentionForMLA(torch::Tensor& merged_attn, torch::Tensor const } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -496,7 +500,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("load_paged_kv_cache_for_mla", &torch_ext::loadPagedKVCacheForMLA); + m.impl("load_paged_kv_cache_for_mla", &tensorrt_llm::torch_ext::loadPagedKVCacheForMLA); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -527,7 +531,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("load_chunked_kv_cache_for_mla", &torch_ext::loadChunkedKVCacheForMLA); + m.impl("load_chunked_kv_cache_for_mla", &tensorrt_llm::torch_ext::loadChunkedKVCacheForMLA); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -562,7 +566,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mla_rope_append_paged_kv_assign_q", &torch_ext::MLARopeAppendPagedKVAssignQ); + m.impl("mla_rope_append_paged_kv_assign_q", &tensorrt_llm::torch_ext::MLARopeAppendPagedKVAssignQ); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -584,5 +588,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("merge_chunked_attention_for_mla", &torch_ext::mergeChunkedAttentionForMLA); + m.impl("merge_chunked_attention_for_mla", &tensorrt_llm::torch_ext::mergeChunkedAttentionForMLA); } diff --git a/cpp/tensorrt_llm/thop/moeAlignOp.cpp b/cpp/tensorrt_llm/thop/moeAlignOp.cpp index b12b7fc401..d28b9261af 100644 --- a/cpp/tensorrt_llm/thop/moeAlignOp.cpp +++ b/cpp/tensorrt_llm/thop/moeAlignOp.cpp @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moeAlignKernels.h" #include "thUtils.h" #include namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -46,6 +49,8 @@ void moeAlignBlockSizeOp(torch::Tensor topk_ids, int64_t num_experts, int64_t bl } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -55,5 +60,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_align_block_size", &torch_ext::moeAlignBlockSizeOp); + m.impl("moe_align_block_size", &tensorrt_llm::torch_ext::moeAlignBlockSizeOp); } diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h b/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h index ef37af4bc1..d8634e6a4f 100644 --- a/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h +++ b/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace moe_comm @@ -63,3 +67,5 @@ inline std::vector> getMoeA2AMetaInfoIndexPairs( } // namespace moe_comm } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp index 2a74f36457..e11135ddfb 100644 --- a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp +++ b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -521,6 +523,8 @@ int64_t moeA2AGetAuxDataSizeOp(int64_t epSize, int64_t maxNumTokens) } // namespace torch_ext +TRTLLM_NAMESPACE_END + // PyTorch bindings TORCH_LIBRARY_FRAGMENT(trtllm, module) { @@ -546,14 +550,15 @@ TORCH_LIBRARY_FRAGMENT(trtllm, module) "runtime_max_tokens_per_rank, " "int combine_payload_offset, ScalarType out_dtype, int hidden_size) -> Tensor(a)"); module.def("moe_a2a_get_aux_data_size(int ep_size, int max_num_tokens) -> int", - &torch_ext::moe_comm::moeA2AGetAuxDataSizeOp); + 
&tensorrt_llm::torch_ext::moe_comm::moeA2AGetAuxDataSizeOp); } TORCH_LIBRARY_IMPL(trtllm, CUDA, module) { - module.impl("moe_a2a_dispatch", &torch_ext::moe_comm::moeA2ADispatchOp); - module.impl("moe_a2a_combine", &torch_ext::moe_comm::moeA2ACombineOp); - module.impl("moe_a2a_initialize", &torch_ext::moe_comm::moeA2AInitializeOp); - module.impl("moe_a2a_sanitize_expert_ids", &torch_ext::moe_comm::moeA2ASanitizeExpertIdsOp); - module.impl("moe_a2a_get_combine_payload_tensor", &torch_ext::moe_comm::moeA2AGetCombinePayloadTensorOp); + module.impl("moe_a2a_dispatch", &tensorrt_llm::torch_ext::moe_comm::moeA2ADispatchOp); + module.impl("moe_a2a_combine", &tensorrt_llm::torch_ext::moe_comm::moeA2ACombineOp); + module.impl("moe_a2a_initialize", &tensorrt_llm::torch_ext::moe_comm::moeA2AInitializeOp); + module.impl("moe_a2a_sanitize_expert_ids", &tensorrt_llm::torch_ext::moe_comm::moeA2ASanitizeExpertIdsOp); + module.impl( + "moe_a2a_get_combine_payload_tensor", &tensorrt_llm::torch_ext::moe_comm::moeA2AGetCombinePayloadTensorOp); } diff --git a/cpp/tensorrt_llm/thop/moeCommOp.cpp b/cpp/tensorrt_llm/thop/moeCommOp.cpp index af8ed85b5b..aaf5255b39 100644 --- a/cpp/tensorrt_llm/thop/moeCommOp.cpp +++ b/cpp/tensorrt_llm/thop/moeCommOp.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -249,6 +251,8 @@ void memsetExpertIds(torch::Tensor expertsIds, torch::Tensor recvRankCountCumSum } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -259,7 +263,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_comm", &torch_ext::moeCommOp); + m.impl("moe_comm", &tensorrt_llm::torch_ext::moeCommOp); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -269,7 +273,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_initialize_workspace", &torch_ext::initializeMoeWorkspace); + m.impl("moe_initialize_workspace", 
&tensorrt_llm::torch_ext::initializeMoeWorkspace); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -279,7 +283,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("get_moe_commworkspace_size_per_rank", &torch_ext::getWorkspaceSizePerRank); + m.impl("get_moe_commworkspace_size_per_rank", &tensorrt_llm::torch_ext::getWorkspaceSizePerRank); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -289,7 +293,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("set_moe_max_usable_sm_count", &torch_ext::setMaxUsableSmCount); + m.impl("set_moe_max_usable_sm_count", &tensorrt_llm::torch_ext::setMaxUsableSmCount); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -302,7 +306,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mnnvl_moe_alltoallv_prepare_without_allgather", &torch_ext::moePrepareOp); + m.impl("mnnvl_moe_alltoallv_prepare_without_allgather", &tensorrt_llm::torch_ext::moePrepareOp); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -315,7 +319,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("memset_expert_ids", &torch_ext::memsetExpertIds); + m.impl("memset_expert_ids", &tensorrt_llm::torch_ext::memsetExpertIds); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -325,5 +329,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("get_moe_prepare_workspace_size_per_rank", &torch_ext::getPrepareWorkspaceSizePerRank); + m.impl("get_moe_prepare_workspace_size_per_rank", &tensorrt_llm::torch_ext::getPrepareWorkspaceSizePerRank); } diff --git a/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp b/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp index 4cc7bbd4b3..aacf3a62e9 100644 --- a/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp +++ b/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp @@ -29,6 +29,8 @@ #include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h" #include 
"tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -200,6 +202,8 @@ void migrateToHostAccessible(at::Tensor& tensor) } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("moe_load_balance_wait_gpu_stage(int single_layer_load_balancer_ptr) -> Tensor"); @@ -207,7 +211,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("moe_load_balance_wait_gpu_stage", &torch_ext::moeLoadBalanceWaitGpuStage); + m.impl("moe_load_balance_wait_gpu_stage", &tensorrt_llm::torch_ext::moeLoadBalanceWaitGpuStage); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -217,7 +221,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("moe_load_balance_set_cpu_stage", &torch_ext::moeLoadBalanceSetCpuStage); + m.impl("moe_load_balance_set_cpu_stage", &tensorrt_llm::torch_ext::moeLoadBalanceSetCpuStage); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -229,7 +233,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_load_balance_statistic", &torch_ext::moeLoadBalanceStatistic); + m.impl("moe_load_balance_statistic", &tensorrt_llm::torch_ext::moeLoadBalanceStatistic); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -242,7 +246,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_hierarchical_statistic_local_device", &torch_ext::moeHierarchicalStatisticLocalDevice); + m.impl("moe_hierarchical_statistic_local_device", &tensorrt_llm::torch_ext::moeHierarchicalStatisticLocalDevice); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -254,7 +258,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_hierarchical_statistic_update", &torch_ext::moeHierarchicalStatisticUpdate); + m.impl("moe_hierarchical_statistic_update", &tensorrt_llm::torch_ext::moeHierarchicalStatisticUpdate); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -266,7 +270,7 @@ 
TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_load_balance_routing", &torch_ext::moeLoadBalanceRouting); + m.impl("moe_load_balance_routing", &tensorrt_llm::torch_ext::moeLoadBalanceRouting); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -276,5 +280,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("migrate_to_host_accessible", &torch_ext::migrateToHostAccessible); + m.impl("migrate_to_host_accessible", &tensorrt_llm::torch_ext::migrateToHostAccessible); } diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp index 953de1c58f..ae62b0a32e 100644 --- a/cpp/tensorrt_llm/thop/moeOp.cpp +++ b/cpp/tensorrt_llm/thop/moeOp.cpp @@ -23,6 +23,7 @@ // Always include the public header for moe_gemm_kernels.h #include "tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/workspace.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" #include "tensorrt_llm/kernels/cutlass_kernels/include/cutlass_kernel_selector.h" @@ -42,6 +43,8 @@ C10_THROW_ERROR(ErrorType, oss.str()); \ } while (0) +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -234,6 +237,7 @@ public: mProfiler = std::make_shared(); mGemm1Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_1); mGemm2Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_2); + cuInit(0); } ~FusedMoeRunner() @@ -1193,12 +1197,14 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY(trtllm, m) { - m.class_("FusedMoeRunner") + m.class_("FusedMoeRunner") .def(torch::init()) - .def("run_gemm_profile", &torch_ext::FusedMoeRunner::runGemmProfile) - .def("get_tactic_num", &torch_ext::FusedMoeRunner::getTacticNum) - .def("run_moe", &torch_ext::FusedMoeRunner::runMoe) - .def("run_moe_min_latency", &torch_ext::FusedMoeRunner::runMoeMinLantency); + .def("run_gemm_profile", 
&tensorrt_llm::torch_ext::FusedMoeRunner::runGemmProfile) + .def("get_tactic_num", &tensorrt_llm::torch_ext::FusedMoeRunner::getTacticNum) + .def("run_moe", &tensorrt_llm::torch_ext::FusedMoeRunner::runMoe) + .def("run_moe_min_latency", &tensorrt_llm::torch_ext::FusedMoeRunner::runMoeMinLantency); } diff --git a/cpp/tensorrt_llm/thop/moeUtilOp.cpp b/cpp/tensorrt_llm/thop/moeUtilOp.cpp index cd1f327066..c11fe1703b 100644 --- a/cpp/tensorrt_llm/thop/moeUtilOp.cpp +++ b/cpp/tensorrt_llm/thop/moeUtilOp.cpp @@ -32,6 +32,8 @@ namespace common = tensorrt_llm::common; namespace kernels = tensorrt_llm::kernels; namespace cutlass_kernels = tensorrt_llm::kernels::cutlass_kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -329,6 +331,8 @@ torch::Tensor run_moe_finalize_scale_op(torch::Tensor const& gemm2_output, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -347,6 +351,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_permute_op", &torch_ext::moe_permute_op); - m.impl("moe_finalize_scale_op", &torch_ext::run_moe_finalize_scale_op); + m.impl("moe_permute_op", &tensorrt_llm::torch_ext::moe_permute_op); + m.impl("moe_finalize_scale_op", &tensorrt_llm::torch_ext::run_moe_finalize_scale_op); } diff --git a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp index 2fdc8573cf..087871593e 100644 --- a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace btg = batchedGemm::trtllm::gen; @@ -664,16 +666,18 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Accepts CUDA tensor only TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("Bf16MxE2m1BlockScaleMoERunner") + m.class_("Bf16MxE2m1BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", 
&torch_ext::Bf16MxE2m1BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::Bf16MxE2m1BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::Bf16MxE2m1BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::Bf16MxE2m1BlockScaleMoeRunner::run); - m.class_("MxE4m3MxE2m1BlockScaleMoERunner") + m.class_("MxE4m3MxE2m1BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::run); } diff --git a/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp b/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp index ba651f2886..306e09e1c1 100644 --- a/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp @@ -24,6 +24,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [M, K], fp16/bf16/fp8_quantized @@ -102,6 +104,8 @@ std::tuple mxfp8_quantize( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -111,5 +115,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mxfp8_quantize", &torch_ext::mxfp8_quantize); + m.impl("mxfp8_quantize", &tensorrt_llm::torch_ext::mxfp8_quantize); } diff --git a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp index 22a33e27b2..75ae96f36b 100644 --- a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp +++ b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp @@ -20,6 +20,8 @@ namespace tr = tensorrt_llm::runtime; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -47,7 +49,10 @@ void NcclCommunicatorOp::recv(th::Tensor& tensor, int64_t fromRank) const } // namespace torch_ext -static auto trtllmNcclCommunicator = 
torch::jit::class_("trtllm", "NcclCommunicatorOp") - .def(torch::jit::init()) - .def("send", &torch_ext::NcclCommunicatorOp::send) - .def("recv", &torch_ext::NcclCommunicatorOp::recv); +TRTLLM_NAMESPACE_END + +static auto trtllmNcclCommunicator + = torch::jit::class_("trtllm", "NcclCommunicatorOp") + .def(torch::jit::init()) + .def("send", &tensorrt_llm::torch_ext::NcclCommunicatorOp::send) + .def("recv", &tensorrt_llm::torch_ext::NcclCommunicatorOp::recv); diff --git a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h old mode 100755 new mode 100644 index 4cf376c0ef..38f4d215ac --- a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h +++ b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/ncclCommunicator.h" #include "tensorrt_llm/thop/thUtils.h" #include namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -38,3 +41,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/noAuxTcOp.cpp b/cpp/tensorrt_llm/thop/noAuxTcOp.cpp index 0804fb96b9..e445206e1d 100644 --- a/cpp/tensorrt_llm/thop/noAuxTcOp.cpp +++ b/cpp/tensorrt_llm/thop/noAuxTcOp.cpp @@ -32,6 +32,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { std::tuple noaux_tc_op(th::Tensor const& scores, th::Tensor const& bias, int64_t n_group, @@ -157,6 +159,8 @@ std::tuple noaux_tc_op(th::Tensor const& scores, th::Ten } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -166,5 +170,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("noaux_tc_op", &torch_ext::noaux_tc_op); + m.impl("noaux_tc_op", &tensorrt_llm::torch_ext::noaux_tc_op); } diff --git a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp 
b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp index 4c7b3d733a..400cf81033 100644 --- a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp +++ b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp @@ -23,6 +23,8 @@ namespace th = torch; namespace tksd = tensorrt_llm::kernels::speculative_decoding; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -113,5 +115,7 @@ void updateKVCacheDraftTokenLocation(torch::Tensor seqAcceptedDraftTokenOffsetsT } // namespace torch_ext +TRTLLM_NAMESPACE_END + static auto update_kv_cache_draft_token_location = torch::RegisterOperators( - "tensorrt_llm::update_kv_cache_draft_token_location", &torch_ext::updateKVCacheDraftTokenLocation); + "tensorrt_llm::update_kv_cache_draft_token_location", &tensorrt_llm::torch_ext::updateKVCacheDraftTokenLocation); diff --git a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp index 7ff79e0c22..d72622b6c8 100644 --- a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp +++ b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp @@ -35,6 +35,8 @@ namespace tr = tensorrt_llm::runtime; namespace tk = tensorrt_llm::kernels; namespace tksd = tensorrt_llm::kernels::speculative_decoding; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -163,5 +165,7 @@ void prepareRandomTensors(th::Tensor& curandState, // [maxBatchSize, 48], uint8_ } // namespace torch_ext -static auto redrafter_prepare_random_tensors - = torch::RegisterOperators("tensorrt_llm::redrafter_prepare_random_tensors", &torch_ext::prepareRandomTensors); +TRTLLM_NAMESPACE_END + +static auto redrafter_prepare_random_tensors = torch::RegisterOperators( + "tensorrt_llm::redrafter_prepare_random_tensors", &tensorrt_llm::torch_ext::prepareRandomTensors); diff --git a/cpp/tensorrt_llm/thop/reducescatterOp.cpp b/cpp/tensorrt_llm/thop/reducescatterOp.cpp index a8f1d93ee1..40f89e40ff 100644 --- a/cpp/tensorrt_llm/thop/reducescatterOp.cpp +++ b/cpp/tensorrt_llm/thop/reducescatterOp.cpp @@ -34,6 +34,8 @@ using 
tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -287,6 +289,8 @@ extern std::vector reducescatter_list_pg(torch::TensorList input_ } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("reducescatter(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); @@ -301,8 +305,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("reducescatter", &torch_ext::reducescatter); - m.impl("reducescatter_pg", &torch_ext::reducescatter_pg); - m.impl("reducescatter_list", &torch_ext::reducescatter_list); - m.impl("reducescatter_list_pg", &torch_ext::reducescatter_list_pg); + m.impl("reducescatter", &tensorrt_llm::torch_ext::reducescatter); + m.impl("reducescatter_pg", &tensorrt_llm::torch_ext::reducescatter_pg); + m.impl("reducescatter_list", &tensorrt_llm::torch_ext::reducescatter_list); + m.impl("reducescatter_list_pg", &tensorrt_llm::torch_ext::reducescatter_list_pg); } diff --git a/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp b/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp index b2b3f366a3..36306ac815 100644 --- a/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp +++ b/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -68,5 +70,7 @@ void buildRelativeAttentionBias( } // namespace torch_ext -static auto relative_attention_bias - = torch::RegisterOperators("tensorrt_llm::relative_attention_bias", &torch_ext::buildRelativeAttentionBias); +TRTLLM_NAMESPACE_END + +static auto relative_attention_bias = torch::RegisterOperators( + "tensorrt_llm::relative_attention_bias", &tensorrt_llm::torch_ext::buildRelativeAttentionBias); diff --git a/cpp/tensorrt_llm/thop/selectiveScanOp.cpp b/cpp/tensorrt_llm/thop/selectiveScanOp.cpp index 46bcfda217..4414a3ce5d 100644 --- 
a/cpp/tensorrt_llm/thop/selectiveScanOp.cpp +++ b/cpp/tensorrt_llm/thop/selectiveScanOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -244,6 +246,8 @@ std::tuple selective_scan(th::Tensor const& input, th::T } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -259,5 +263,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("selective_scan", &torch_ext::selective_scan); + m.impl("selective_scan", &tensorrt_llm::torch_ext::selective_scan); } diff --git a/cpp/tensorrt_llm/thop/specDecOp.cpp b/cpp/tensorrt_llm/thop/specDecOp.cpp index c68c08e29e..5f4111574e 100644 --- a/cpp/tensorrt_llm/thop/specDecOp.cpp +++ b/cpp/tensorrt_llm/thop/specDecOp.cpp @@ -15,6 +15,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h" @@ -25,6 +26,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -336,6 +339,8 @@ void extract_real_draft_tokens_op(th::Tensor newDraftTokens, th::Tensor draftTok } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -348,7 +353,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_prepare_drafter_inputs_op", &torch_ext::mtp_prepare_drafter_inputs_op); + m.impl("mtp_prepare_drafter_inputs_op", &tensorrt_llm::torch_ext::mtp_prepare_drafter_inputs_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -363,7 +368,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_sampling_and_accepted_draft_tokens_op", 
&torch_ext::mtp_sampling_and_accepted_draft_tokens_op); + m.impl("mtp_sampling_and_accepted_draft_tokens_op", + &tensorrt_llm::torch_ext::mtp_sampling_and_accepted_draft_tokens_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -378,7 +384,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_update_hidden_states_op", &torch_ext::mtp_update_hidden_states_op); + m.impl("mtp_update_hidden_states_op", &tensorrt_llm::torch_ext::mtp_update_hidden_states_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -394,7 +400,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_relaxed_acceptance_op", &torch_ext::mtp_relaxed_acceptance_op); + m.impl("mtp_relaxed_acceptance_op", &tensorrt_llm::torch_ext::mtp_relaxed_acceptance_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -409,5 +415,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("extract_real_draft_tokens_op", &torch_ext::extract_real_draft_tokens_op); + m.impl("extract_real_draft_tokens_op", &tensorrt_llm::torch_ext::extract_real_draft_tokens_op); } diff --git a/cpp/tensorrt_llm/thop/thUtils.cpp b/cpp/tensorrt_llm/thop/thUtils.cpp index 5c81856999..97fe6acaab 100644 --- a/cpp/tensorrt_llm/thop/thUtils.cpp +++ b/cpp/tensorrt_llm/thop/thUtils.cpp @@ -18,6 +18,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -111,3 +113,5 @@ cudaDataType_t convert_torch_dtype(torch::ScalarType dtype) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/thUtils.h b/cpp/tensorrt_llm/thop/thUtils.h index 3ca6701ee2..04ec60e007 100644 --- a/cpp/tensorrt_llm/thop/thUtils.h +++ b/cpp/tensorrt_llm/thop/thUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" 
#include "tensorrt_llm/runtime/iTensor.h" #include @@ -54,6 +55,8 @@ #define PRINT_TENSOR(x) std::cout << #x << ":\n" << x << std::endl #define PRINT_TENSOR_SIZE(x) std::cout << "size of " << #x << ": " << x.sizes() << std::endl +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -93,3 +96,5 @@ std::optional getFloatEnv(char const* name); cudaDataType_t convert_torch_dtype(torch::ScalarType dtype); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/tinygemm2.cpp b/cpp/tensorrt_llm/thop/tinygemm2.cpp index 3be0bea04b..b617a65812 100644 --- a/cpp/tensorrt_llm/thop/tinygemm2.cpp +++ b/cpp/tensorrt_llm/thop/tinygemm2.cpp @@ -26,6 +26,8 @@ torch::Tensor tinygemm2_cuda_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias); // C++ interface +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { torch::Tensor tinygemm2_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) @@ -45,6 +47,8 @@ torch::Tensor tinygemm2_forward(torch::Tensor input, torch::Tensor weight, torch } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("tinygemm2(Tensor input, Tensor weight, Tensor bias) -> Tensor"); @@ -52,5 +56,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("tinygemm2", &torch_ext::tinygemm2_forward); + m.impl("tinygemm2", &tensorrt_llm::torch_ext::tinygemm2_forward); } diff --git a/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp b/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp index f29ea57e71..3857259b2b 100644 --- a/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp +++ b/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp @@ -34,7 +34,7 @@ torch::Tensor userbuffers_allreduce_finalize(torch::Tensor input, bool force_app int hidden_size = input.size(-1); auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance(); - auto [output, ub_buffer] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); + auto [output, 
ub_buffer] = tensorrt_llm::torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); auto const dtype = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type()); diff --git a/cpp/tensorrt_llm/thop/userbuffersTensor.cpp b/cpp/tensorrt_llm/thop/userbuffersTensor.cpp index 4318f38bcd..47c1ea6998 100644 --- a/cpp/tensorrt_llm/thop/userbuffersTensor.cpp +++ b/cpp/tensorrt_llm/thop/userbuffersTensor.cpp @@ -15,6 +15,8 @@ */ #include "userbuffersTensor.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -46,7 +48,9 @@ torch::Tensor create_userbuffers_tensor_op(at::IntArrayRef shape, torch::ScalarT } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.def("create_userbuffers_tensor", &torch_ext::create_userbuffers_tensor_op); + m.def("create_userbuffers_tensor", &tensorrt_llm::torch_ext::create_userbuffers_tensor_op); } diff --git a/cpp/tensorrt_llm/thop/userbuffersTensor.h b/cpp/tensorrt_llm/thop/userbuffersTensor.h index 86c634c7ff..861c3e6620 100644 --- a/cpp/tensorrt_llm/thop/userbuffersTensor.h +++ b/cpp/tensorrt_llm/thop/userbuffersTensor.h @@ -15,9 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -25,3 +28,5 @@ std::pair create_userbuffers at::IntArrayRef shape, torch::ScalarType dtype); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp index a00b51e16e..b8cfac19a8 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp @@ -15,6 +15,7 @@ */ #include "weightOnlyQuantGemm.h" #include "cutlass/numeric_types.h" +#include "tensorrt_llm/common/config.h" #include #include @@ -22,6 +23,8 @@ using namespace tensorrt_llm::kernels::cutlass_kernels; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + 
namespace torch_ext { @@ -156,10 +159,12 @@ int64_t WeightOnlyQuantGemmRunner::getNumConfigs() const } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("WeightOnlyQuantGemmRunner") + m.class_("WeightOnlyQuantGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::WeightOnlyQuantGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::WeightOnlyQuantGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::WeightOnlyQuantGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::WeightOnlyQuantGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h index df062d79a5..0b08b51b36 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h @@ -18,6 +18,7 @@ #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" @@ -29,6 +30,8 @@ using namespace tensorrt_llm::kernels::cutlass_kernels; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using WeightOnlyQuantGemmRunnerPtr = std::shared_ptr; @@ -51,3 +54,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp index b6feba15e6..89c3312b9b 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h" #include "tensorrt_llm/thop/thUtils.h" @@ -23,6 +24,8 @@ #define TORCH_IS_AT_LEAST_v190 #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using torch::Tensor; @@ -400,35 +403,38 @@ Tensor mxfp4_dequantize_unswizzled(Tensor weight, Tensor scale, int64_t group_si } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Utility methods that may be useful for preprocessing weights in torch. static auto symmetric_quantize_last_axis_of_batched_matrix = torch::RegisterOperators("trtllm::symmetric_quantize_last_axis_of_batched_matrix", - &torch_ext::symmetric_quantize_last_axis_of_batched_matrix); + &tensorrt_llm::torch_ext::symmetric_quantize_last_axis_of_batched_matrix); static auto preprocess_weights_for_mixed_gemm = torch::RegisterOperators( - "trtllm::preprocess_weights_for_mixed_gemm", &torch_ext::preprocess_weights_for_mixed_gemm); + "trtllm::preprocess_weights_for_mixed_gemm", &tensorrt_llm::torch_ext::preprocess_weights_for_mixed_gemm); static auto unpack_int4_packed_tensor_to_int8 = torch::RegisterOperators( - "trtllm::unpack_int4_packed_tensor_to_int8", &torch_ext::unpack_int4_packed_tensor_to_int8); + "trtllm::unpack_int4_packed_tensor_to_int8", &tensorrt_llm::torch_ext::unpack_int4_packed_tensor_to_int8); -static auto pack_int8_tensor_to_packed_int4 - = torch::RegisterOperators("trtllm::pack_int8_tensor_to_packed_int4", &torch_ext::pack_int8_tensor_to_packed_int4); +static auto pack_int8_tensor_to_packed_int4 = torch::RegisterOperators( + "trtllm::pack_int8_tensor_to_packed_int4", &tensorrt_llm::torch_ext::pack_int8_tensor_to_packed_int4); // Utility methods exposed purely for unit tests in torch. 
static auto _symmetric_quantize_last_axis_of_batched_matrix = torch::RegisterOperators("trtllm::_symmetric_quantize_last_axis_of_batched_matrix", - &torch_ext::_symmetric_quantize_last_axis_of_batched_matrix); + &tensorrt_llm::torch_ext::_symmetric_quantize_last_axis_of_batched_matrix); -static auto add_bias_and_interleave_int4s - = torch::RegisterOperators("trtllm::_add_bias_and_interleave_int4s", &torch_ext::add_bias_and_interleave_int4s); +static auto add_bias_and_interleave_int4s = torch::RegisterOperators( + "trtllm::_add_bias_and_interleave_int4s", &tensorrt_llm::torch_ext::add_bias_and_interleave_int4s); -static auto add_bias_and_interleave_int8s - = torch::RegisterOperators("trtllm::_add_bias_and_interleave_int8s", &torch_ext::add_bias_and_interleave_int8s); +static auto add_bias_and_interleave_int8s = torch::RegisterOperators( + "trtllm::_add_bias_and_interleave_int8s", &tensorrt_llm::torch_ext::add_bias_and_interleave_int8s); -static auto permute_B_rows_for_mixed_gemm - = torch::RegisterOperators("trtllm::_permute_B_rows_for_mixed_gemm", &torch_ext::permute_B_rows_for_mixed_gemm); +static auto permute_B_rows_for_mixed_gemm = torch::RegisterOperators( + "trtllm::_permute_B_rows_for_mixed_gemm", &tensorrt_llm::torch_ext::permute_B_rows_for_mixed_gemm); -static auto subbyte_transpose = torch::RegisterOperators("trtllm::_subbyte_transpose", &torch_ext::subbyte_transpose); +static auto subbyte_transpose + = torch::RegisterOperators("trtllm::_subbyte_transpose", &tensorrt_llm::torch_ext::subbyte_transpose); -static auto mxfp4_dequantize_unswizzled - = torch::RegisterOperators("trtllm::mxfp4_dequantize_unswizzled", &torch_ext::mxfp4_dequantize_unswizzled); +static auto mxfp4_dequantize_unswizzled = torch::RegisterOperators( + "trtllm::mxfp4_dequantize_unswizzled", &tensorrt_llm::torch_ext::mxfp4_dequantize_unswizzled); diff --git a/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp index 303ed40117..221cd98b5f 
100644 --- a/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp @@ -178,6 +178,7 @@ void testSendMRecv() } else if (rank == 1) { +#if ENABLE_MULTI_DEVICE MPI_Message msg; MPI_Status status; comm.mprobe(0, tag, &msg, &status); @@ -190,6 +191,7 @@ void testSendMRecv() MPICHECK( MPI_Mrecv(&value, count, getMpiDtype(mpi::MpiTypeConverter>::value), &msg, &status)); EXPECT_EQ(value, expectedValue); +#endif // ENABLE_MULTI_DEVICE } } diff --git a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp index bf4ddd2141..88533ce7ca 100644 --- a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp @@ -36,7 +36,7 @@ namespace mpi = tensorrt_llm::mpi; namespace tr = tensorrt_llm::runtime; namespace nccl_util = tensorrt_llm::common::nccl_util; -using ::getComm; +using tensorrt_llm::getComm; // Helper function to create a split communicator for testing // This allows us to test cleanup behavior explicitly by controlling the lifetime diff --git a/cpp/tests/unit_tests/thop/thUtilsTest.cpp b/cpp/tests/unit_tests/thop/thUtilsTest.cpp index 262609cad8..06bf41b8fb 100644 --- a/cpp/tests/unit_tests/thop/thUtilsTest.cpp +++ b/cpp/tests/unit_tests/thop/thUtilsTest.cpp @@ -19,7 +19,7 @@ #include "tensorrt_llm/thop/thUtils.h" #include -using namespace torch_ext; +using namespace tensorrt_llm::torch_ext; TEST(ThUtils, ConvertShape2D) { diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index bd836df4f5..03aae58617 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -290,6 +290,11 @@ def generate_fmha_cu(project_dir, venv_python): move_if_updated(fmha_v2_dir / "generated/fmha_cubin.h", cubin_dir / "fmha_cubin.h") + # Copy generated source file (fmha_cubin.cpp) to the same directory as header + cpp_src = fmha_v2_dir / "generated/fmha_cubin.cpp" + if cpp_src.exists(): + move_if_updated(cpp_src, cubin_dir / "fmha_cubin.cpp") 
+ generated_files = set() for cu_file in (fmha_v2_dir / "generated").glob("*sm*.cu"): dst_file = fmha_v2_cu_dir / os.path.basename(cu_file) diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py index 3a611a640c..348e665475 100644 --- a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py @@ -5,7 +5,10 @@ import torch import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils from ..._utils import get_sm_version -from .cute_dsl_custom_ops import GroupedGemmInputsHelper +from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE + +if IS_CUTLASS_DSL_AVAILABLE: + from .cute_dsl_custom_ops import GroupedGemmInputsHelper def _register_fake(): @@ -486,104 +489,106 @@ def _register_fake(): return gemm2_output.new_empty((num_rows_val, unpadded_hidden_size_val), dtype=gemm2_output.dtype) - @torch.library.register_fake("trtllm::moe_topk_sort") - def _( - routing_logits: torch.Tensor, - routing_bias: Optional[torch.Tensor], - num_experts: int, - top_k: int, - n_group: Optional[int], - topk_group: Optional[int], - local_expert_offset: int, - local_num_experts: int, - routed_scaling_factor: Optional[float], - tile_tokens_dim: int, - routing_method_type: int, - ) -> List[torch.Tensor]: - helper = GroupedGemmInputsHelper( - num_experts=num_experts, - top_k=top_k, - num_local_experts=local_num_experts, - local_expert_offset=local_expert_offset, - tile_size=tile_tokens_dim, - ) - num_tokens = routing_logits.size(0) - device = routing_logits.device - routing_bias_dtype = torch.bfloat16 if routing_bias is None else routing_bias.dtype - max_num_tiles = helper.get_max_num_tiles(num_tokens) - max_num_permuted_tokens = helper.get_max_num_permuted_tokens(num_tokens) - tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - expanded_idx_to_permuted_idx 
= torch.empty((num_tokens, top_k), - dtype=torch.int32, - device=device) - permuted_idx_to_expanded_idx = torch.empty((max_num_permuted_tokens, ), - dtype=torch.int32, - device=device) - total_num_padded_tokens = torch.empty((1, ), - dtype=torch.int32, - device=device) - num_non_exiting_tiles = torch.empty((1, ), - dtype=torch.int32, - device=device) - new_token_final_scales = torch.empty((num_tokens, top_k), - dtype=routing_bias_dtype, - device=device) - return [ - tile_idx_to_expert_idx, tile_idx_to_mn_limit, - expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, - total_num_padded_tokens, num_non_exiting_tiles, - new_token_final_scales - ] + if IS_CUTLASS_DSL_AVAILABLE: - @torch.library.register_fake("trtllm::moe_sort") - def _( - token_selected_experts: torch.Tensor, - token_final_scales: torch.Tensor, - num_experts: int, - top_k: int, - local_expert_offset: int, - local_num_experts: int, - tile_tokens_dim: int, - ) -> List[torch.Tensor]: - helper = GroupedGemmInputsHelper( - num_experts=num_experts, - top_k=top_k, - num_local_experts=local_num_experts, - local_expert_offset=local_expert_offset, - tile_size=tile_tokens_dim, - ) - num_tokens = token_selected_experts.size(0) - device = token_selected_experts.device - max_num_tiles = helper.get_max_num_tiles(num_tokens) - max_num_permuted_tokens = helper.get_max_num_permuted_tokens(num_tokens) - tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), - dtype=torch.int32, - device=device) - permuted_idx_to_expanded_idx = torch.empty((max_num_permuted_tokens, ), - dtype=torch.int32, - device=device) - total_num_padded_tokens = torch.empty((1, ), - dtype=torch.int32, - device=device) - num_non_exiting_tiles = torch.empty((1, ), - dtype=torch.int32, - device=device) - return [ - tile_idx_to_expert_idx, 
tile_idx_to_mn_limit, - expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, - total_num_padded_tokens, num_non_exiting_tiles - ] + @torch.library.register_fake("trtllm::moe_topk_sort") + def _( + routing_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + num_experts: int, + top_k: int, + n_group: Optional[int], + topk_group: Optional[int], + local_expert_offset: int, + local_num_experts: int, + routed_scaling_factor: Optional[float], + tile_tokens_dim: int, + routing_method_type: int, + ) -> List[torch.Tensor]: + helper = GroupedGemmInputsHelper( + num_experts=num_experts, + top_k=top_k, + num_local_experts=local_num_experts, + local_expert_offset=local_expert_offset, + tile_size=tile_tokens_dim, + ) + num_tokens = routing_logits.size(0) + device = routing_logits.device + routing_bias_dtype = torch.bfloat16 if routing_bias is None else routing_bias.dtype + max_num_tiles = helper.get_max_num_tiles(num_tokens) + max_num_permuted_tokens = helper.get_max_num_permuted_tokens( + num_tokens) + tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), + dtype=torch.int32, + device=device) + permuted_idx_to_expanded_idx = torch.empty( + (max_num_permuted_tokens, ), dtype=torch.int32, device=device) + total_num_padded_tokens = torch.empty((1, ), + dtype=torch.int32, + device=device) + num_non_exiting_tiles = torch.empty((1, ), + dtype=torch.int32, + device=device) + new_token_final_scales = torch.empty((num_tokens, top_k), + dtype=routing_bias_dtype, + device=device) + return [ + tile_idx_to_expert_idx, tile_idx_to_mn_limit, + expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, + total_num_padded_tokens, num_non_exiting_tiles, + new_token_final_scales + ] + + @torch.library.register_fake("trtllm::moe_sort") + def _( + token_selected_experts: 
torch.Tensor, + token_final_scales: torch.Tensor, + num_experts: int, + top_k: int, + local_expert_offset: int, + local_num_experts: int, + tile_tokens_dim: int, + ) -> List[torch.Tensor]: + helper = GroupedGemmInputsHelper( + num_experts=num_experts, + top_k=top_k, + num_local_experts=local_num_experts, + local_expert_offset=local_expert_offset, + tile_size=tile_tokens_dim, + ) + num_tokens = token_selected_experts.size(0) + device = token_selected_experts.device + max_num_tiles = helper.get_max_num_tiles(num_tokens) + max_num_permuted_tokens = helper.get_max_num_permuted_tokens( + num_tokens) + tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), + dtype=torch.int32, + device=device) + permuted_idx_to_expanded_idx = torch.empty( + (max_num_permuted_tokens, ), dtype=torch.int32, device=device) + total_num_padded_tokens = torch.empty((1, ), + dtype=torch.int32, + device=device) + num_non_exiting_tiles = torch.empty((1, ), + dtype=torch.int32, + device=device) + return [ + tile_idx_to_expert_idx, tile_idx_to_mn_limit, + expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, + total_num_padded_tokens, num_non_exiting_tiles + ] @torch.library.register_fake("trtllm::moe_permute") def _( diff --git a/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py b/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py new file mode 100644 index 0000000000..54cf23d6cb --- /dev/null +++ b/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py @@ -0,0 +1,83 @@ +"""Unit tests for FlashInfer fused MOE custom op.""" + +import flashinfer.fused_moe +import pytest +import torch + +import tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.torch_moe # noqa: F401 +import tensorrt_llm._torch.custom_ops.torch_custom_ops as 
trt_ops # noqa: F401 + + +def test_flashinfer_fused_moe_matches_torch_moe(): + """Test that flashinfer_fused_moe matches torch_moe reference.""" + torch.manual_seed(0) + + if not torch.cuda.is_available(): + pytest.skip("CUDA is required for flashinfer_fused_moe test") + + device = "cuda" + dtype = torch.bfloat16 + + # Small test case + M = 8 # tokens + HIDDEN_SIZE = 64 + INTERMEDIATE_SIZE = 128 + E = 4 # experts + top_k = 2 + + # Input + x = torch.randn(M, HIDDEN_SIZE, device=device, dtype=dtype) + + # Expert weights for gated MLP (SwiGLU) + # w1 = gate projection, w3 = up projection, w2 = down projection + w1_list = [ + torch.randn(INTERMEDIATE_SIZE, HIDDEN_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + w2_list = [ + torch.randn(HIDDEN_SIZE, INTERMEDIATE_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + w3_list = [ + torch.randn(INTERMEDIATE_SIZE, HIDDEN_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + + # FlashInfer expects fc1 (gate + up concatenated) and fc2 (down) + # fc1_expert_weights: [E, 2*INTERMEDIATE_SIZE, HIDDEN_SIZE] + w1_w3_stacked = torch.stack( + [torch.cat([w3, w1], dim=0) for w1, w3 in zip(w1_list, w3_list)], dim=0 + ).contiguous() + + # fc2_expert_weights: [E, HIDDEN_SIZE, INTERMEDIATE_SIZE] + w2_stacked = torch.stack(w2_list, dim=0).contiguous() + + # Random routing with top-k normalization + router_logits = torch.randn(M, E, device=device, dtype=torch.float32) + routing_full = torch.softmax(router_logits, dim=-1) + routing_weights, selected_experts = torch.topk(routing_full, k=top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(torch.float32) + + # FlashInfer fused MOE - call directly + out_flashinfer = flashinfer.fused_moe.cutlass_fused_moe( + input=x, + token_selected_experts=selected_experts.to(torch.int32), + token_final_scales=routing_weights, + fc1_expert_weights=w1_w3_stacked, + fc2_expert_weights=w2_stacked, + 
output_dtype=dtype, + quant_scales=[], + ) + + # Reference Torch MoE (gated_mlp with SwiGLU) + out_torch = torch.ops.auto_deploy.torch_moe( + x, + selected_experts, + routing_weights, + w1_weight=w1_list, # gate projection + w2_weight=w2_list, # down projection + w3_weight=w3_list, # up projection + mlp_style="gated_mlp", + act_fn="silu", + ) + + # Compare outputs + torch.testing.assert_close(out_flashinfer[0], out_torch, rtol=5e-1, atol=5e-1) From 2fec53dfa5d1c8b8d64fa80e90184bd597eae3ba Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 23:32:39 +0800 Subject: [PATCH 103/172] [TRTLLM-9637][feat] Support tool parser for Kimi K2 (#9830) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tensorrt_llm/serve/openai_server.py | 5 + tensorrt_llm/serve/postprocess_handlers.py | 6 +- .../serve/tool_parser/kimi_k2_tool_parser.py | 218 ++++++++++++++++++ .../serve/tool_parser/tool_parser_factory.py | 2 + .../unittest/llmapi/apps/test_tool_parsers.py | 144 ++++++++++++ 5 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 tensorrt_llm/serve/tool_parser/kimi_k2_tool_parser.py diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 3811c8a12e..70285d0aea 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -152,6 +152,10 @@ class OpenAIServer: else: self.use_harmony = (self.model_config.model_type == "gpt_oss") + self.tool_call_id_type = "random" # default tool call id type is random + if self.model_config.model_type == "kimi_k2": + self.tool_call_id_type = "kimi_k2" + # as disagg-worker self.disagg_cluster_storage = None self.disagg_cluster_worker = None @@ -554,6 +558,7 @@ class OpenAIServer: postproc_args.reasoning_parser = self.llm.args.reasoning_parser postproc_args.tool_parser = self.tool_parser + postproc_args.tool_call_id_type = self.tool_call_id_type if conversation and conversation[-1].get( 
"content") and conversation[-1].get("role") == get_role(): postproc_args.last_message_content = conversation[-1]["content"] diff --git a/tensorrt_llm/serve/postprocess_handlers.py b/tensorrt_llm/serve/postprocess_handlers.py index f9d78a5354..8a9c203805 100644 --- a/tensorrt_llm/serve/postprocess_handlers.py +++ b/tensorrt_llm/serve/postprocess_handlers.py @@ -54,6 +54,7 @@ class ChatPostprocArgs(PostprocArgs): default_factory=dict) tool_parser_dict: dict[int, BaseToolParser] = field(default_factory=dict) has_tool_call: dict[int, bool] = field(default_factory=dict) + tool_call_id_type: str = "random" @classmethod def from_request(cls, request: ChatCompletionRequest): @@ -223,7 +224,10 @@ def chat_stream_post_processor(rsp: GenerationResultBase, # Tool call ID should be generated only once per tool call if call_item.name: # First chunk: include ID and function name - tool_call_id = make_tool_call_id() + tool_call_id = make_tool_call_id( + id_type=args.tool_call_id_type, + func_name=call_item.name, + idx=call_item.tool_index) function_name = call_item.name else: # Subsequent chunks: null ID and name for argument deltas diff --git a/tensorrt_llm/serve/tool_parser/kimi_k2_tool_parser.py b/tensorrt_llm/serve/tool_parser/kimi_k2_tool_parser.py new file mode 100644 index 0000000000..ca2d0a7d7d --- /dev/null +++ b/tensorrt_llm/serve/tool_parser/kimi_k2_tool_parser.py @@ -0,0 +1,218 @@ +# Adapted from https://github.com/sgl-project/sglang/blob/083629c23564e1a64deaa052f1df5c5d914358d8/python/sglang/srt/function_call/kimik2_detector.py +import json +import re +from typing import List + +from tensorrt_llm.logger import logger + +from ..openai_protocol import ChatCompletionToolsParam as Tool +from .base_tool_parser import BaseToolParser +from .core_types import StreamingParseResult, StructureInfo, ToolCallItem, _GetInfoFunc + + +class KimiK2ToolParser(BaseToolParser): + """Detector for Kimi K2 model function call format. 
+ + Format Structure: + ``` + <|tool_calls_section_begin|> + <|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|> + <|tool_calls_section_end|> + ``` + + Reference: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md + """ + + def __init__(self): + super().__init__() + + self.bot_token: str = "<|tool_calls_section_begin|>" + self.eot_token: str = "<|tool_calls_section_end|>" + + self.tool_call_start_token: str = "<|tool_call_begin|>" + self.tool_call_end_token: str = "<|tool_call_end|>" + + self.tool_call_regex = re.compile( + r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*?\})\s*<\|tool_call_end\|>" + ) + + self.stream_tool_call_portion_regex = re.compile( + r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*)" + ) + + self._last_arguments = "" + + # Robust parser for ids like "functions.search:0" or fallback "search:0" + self.tool_call_id_regex = re.compile(r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$") + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a KimiK2 format tool call.""" + return self.bot_token in text + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls. 
+ """ + if self.bot_token not in text: + return StreamingParseResult(normal_text=text, calls=[]) + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is an array of tuples where one is a function call and + # the other is None + function_call_tuples = self.tool_call_regex.findall(text) + tool_indices = self._get_tool_indices(tools) + + logger.debug("function_call_tuples: %s", function_call_tuples) + + tool_calls = [] + for match in function_call_tuples: + function_id, function_args = match + m = self.tool_call_id_regex.match(function_id) + if not m: + logger.warning("Unexpected tool_call_id format: %s", function_id) + continue + function_name = m.group("name") + function_idx = int(m.group("index")) + + if function_name not in tool_indices: + logger.warning(f"Model attempted to call undefined function: {function_name}") + continue + + logger.debug(f"function_name {function_name}") + + tool_calls.append( + ToolCallItem( + tool_index=function_idx, + name=function_name, + parameters=function_args, + ) + ) + + content = text[: text.find(self.bot_token)] + return StreamingParseResult(normal_text=content, calls=tool_calls) + + except Exception as e: + logger.error(f"Error in detect_and_parse: {e}") + # return the normal text if parsing fails + return StreamingParseResult(normal_text=text) + + def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult: + """Streaming incremental parsing tool calls for KimiK2 format.""" + self._buffer += new_text + current_text = self._buffer + + # Check if we have a tool call (either the start token or individual tool call) + has_tool_call = self.bot_token in current_text or self.tool_call_start_token in current_text + + if not has_tool_call: + self._buffer = "" + for e_token in [self.eot_token, self.tool_call_end_token]: + if e_token in new_text: + new_text = new_text.replace(e_token, "") + return 
StreamingParseResult(normal_text=new_text) + + if not hasattr(self, "_tool_indices"): + self._tool_indices = self._get_tool_indices(tools) + + calls: list[ToolCallItem] = [] + try: + match = self.stream_tool_call_portion_regex.search(current_text) + if match: + function_id = match.group("tool_call_id") + function_args = match.group("function_arguments") + + m = self.tool_call_id_regex.match(function_id) + if not m: + logger.warning("Unexpected tool_call_id format: %s", function_id) + return StreamingParseResult(normal_text="", calls=calls) + function_name = m.group("name") + + # Initialize state if this is the first tool call + if self.current_tool_id == -1: + self.current_tool_id = 0 + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [""] + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + while len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + + if not self.current_tool_name_sent: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=function_name, + parameters="", + ) + ) + self.current_tool_name_sent = True + # Store the tool call info for serving layer completions endpoint + self.prev_tool_call_arr[self.current_tool_id] = { + "name": function_name, + "arguments": {}, + } + else: + argument_diff = ( + function_args[len(self._last_arguments) :] + if function_args.startswith(self._last_arguments) + else function_args + ) + + parsed_args_diff = argument_diff.split("<|tool_call_end|>", 1)[0] + + if parsed_args_diff: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=None, + parameters=parsed_args_diff, + ) + ) + self._last_arguments += argument_diff + self.streamed_args_for_tool[self.current_tool_id] += parsed_args_diff + + parsed_args = function_args.split("<|tool_call_end|>", 1)[0] + try: + parsed_args = json.loads(parsed_args) + 
self.prev_tool_call_arr[self.current_tool_id]["arguments"] = parsed_args + + # Find the end of the current tool call and remove only that part from buffer + tool_call_end_pattern = r"<\|tool_call_begin\|>.*?<\|tool_call_end\|>" + match = re.search(tool_call_end_pattern, current_text, re.DOTALL) + if match: + # Remove the completed tool call from buffer, keep any remaining content + self._buffer = current_text[match.end() :] + else: + self._buffer = "" + + result = StreamingParseResult(normal_text="", calls=calls) + self.current_tool_id += 1 + self._last_arguments = "" + self.current_tool_name_sent = False + return result + except json.JSONDecodeError: + pass + + return StreamingParseResult(normal_text="", calls=calls) + + except Exception as e: + logger.error(f"Error in parse_streaming_increment: {e}") + return StreamingParseResult(normal_text=current_text) + + def structure_info(self) -> _GetInfoFunc: + """Return function that creates StructureInfo for guided generation.""" + + def get_info(name: str) -> StructureInfo: + return StructureInfo( + begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>", + end="<|tool_call_end|><|tool_calls_section_end|>", + trigger="<|tool_calls_section_begin|>", + ) + + return get_info diff --git a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py index 8a9bbe298c..3cf37c01ff 100644 --- a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py +++ b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py @@ -1,6 +1,7 @@ from typing import Type from .base_tool_parser import BaseToolParser +from .kimi_k2_tool_parser import KimiK2ToolParser from .qwen3_coder_parser import Qwen3CoderToolParser from .qwen3_tool_parser import Qwen3ToolParser @@ -9,6 +10,7 @@ class ToolParserFactory: parsers: dict[str, Type[BaseToolParser]] = { "qwen3": Qwen3ToolParser, "qwen3_coder": Qwen3CoderToolParser, + "kimi_k2": KimiK2ToolParser, } @staticmethod 
diff --git a/tests/unittest/llmapi/apps/test_tool_parsers.py b/tests/unittest/llmapi/apps/test_tool_parsers.py index 66ae337336..657257e0ca 100644 --- a/tests/unittest/llmapi/apps/test_tool_parsers.py +++ b/tests/unittest/llmapi/apps/test_tool_parsers.py @@ -23,6 +23,7 @@ from tensorrt_llm.serve.openai_protocol import (ChatCompletionToolsParam, FunctionDefinition) from tensorrt_llm.serve.tool_parser.base_tool_parser import BaseToolParser from tensorrt_llm.serve.tool_parser.core_types import StructureInfo +from tensorrt_llm.serve.tool_parser.kimi_k2_tool_parser import KimiK2ToolParser from tensorrt_llm.serve.tool_parser.qwen3_coder_parser import \ Qwen3CoderToolParser from tensorrt_llm.serve.tool_parser.qwen3_tool_parser import Qwen3ToolParser @@ -469,6 +470,149 @@ class BaseToolParserTestClass: assert len(result.calls) == 0 +class TestKimiK2ToolParser(BaseToolParserTestClass): + """Test suite for KimiK2ToolParser class.""" + + def make_parser(self): + return KimiK2ToolParser() + + def make_tool_parser_test_cases(self): + return ToolParserTestCases( + has_tool_call_true= + 'Some text <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "NYC"}<|tool_call_end|><|tool_calls_section_end|>', + detect_and_parse_single_tool=( + # Input text. + ('Normal text' + '<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "NYC"}<|tool_call_end|><|tool_calls_section_end|>' + ), + # Expected `normal_text`. + "Normal text", + # Expected `name`. + "get_weather", + # Expected `parameters`. + { + "location": "NYC" + }, + ), + detect_and_parse_multiple_tools=( + # Input text. + ('<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"LA"}<|tool_call_end|>\n' + '<|tool_call_begin|>functions.search_web:0<|tool_call_argument_begin|>{"query":"AI"}<|tool_call_end|><|tool_calls_section_end|>' + ), + # Expected names. 
+ ("get_weather", "search_web"), + ), + detect_and_parse_malformed_tool= + ('<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>MALFORMED<|tool_call_end|><|tool_calls_section_end|>' + ), + detect_and_parse_with_parameters_key=( + # Input text. + ('<|tool_calls_section_begin|><|tool_call_begin|>functions.search_web:0<|tool_call_argument_begin|>{"query":"test"}<|tool_call_end|><|tool_calls_section_end|>' + ), + # Expected `name`. + "search_web", + # Expected `parameters`. + { + "query": "test" + }, + ), + parse_streaming_increment_partial_bot_token= + "<|tool_calls_section_begin|><|tool_call_be", + undefined_tool= + '<|tool_calls_section_begin|><|tool_call_begin|>functions.undefined_func:0<|tool_call_argument_begin|>{"arg":"any value"}<|tool_call_end|><|tool_calls_section_end|>', + ) + + def test_initialization(self, parser): + """Test that Qwen3ToolParser initializes correctly.""" + assert parser.bot_token == "<|tool_calls_section_begin|>" + assert parser.eot_token == "<|tool_calls_section_end|>" + + def test_parse_streaming_increment_complete_tool_call( + self, sample_tools, parser): + """Test streaming parser with complete tool call in chunks.""" + + # Send bot token + parser.parse_streaming_increment("<|tool_calls_section_begin|>", + sample_tools) + + # Send partial tool call with name + result = parser.parse_streaming_increment( + '<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{', + sample_tools) + + # Should send tool name + assert len(result.calls) == 1 + assert result.calls[0].name == "get_weather" + assert result.calls[0].parameters == "" + + # Send arguments + result = parser.parse_streaming_increment( + '"location":"SF"}<|tool_call_end|>', sample_tools) + + # Should stream arguments + assert len(result.calls) == 1 + assert json.loads(result.calls[0].parameters) == {"location": "SF"} + + def test_parse_streaming_increment_multiple_tools_streaming( + self, sample_tools, parser): + """Test 
streaming parser handles multiple tool calls.""" + + # First tool + parser.parse_streaming_increment('<|tool_calls_section_begin|>', + sample_tools) + parser.parse_streaming_increment( + '<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|>', + sample_tools) + + # Second tool + parser.parse_streaming_increment( + '<|tool_call_begin|>functions.search_web:0<|tool_call_argument_begin|>{"arg": "any value"}<|tool_call_end|>', + sample_tools) + + result = parser.parse_streaming_increment('<|tool_calls_section_end|>', + sample_tools) + # Should have started second tool + assert result.calls[0].name == "search_web" + assert result.calls[0].parameters == "" + assert result.calls[0].tool_index == 1 + + def test_structure_info_function(self): + """Test structure_info returns correct lambda function.""" + parser = KimiK2ToolParser() + func = parser.structure_info() + + info = func("test_function") + + assert isinstance(info, StructureInfo) + assert info.begin == '<|tool_calls_section_begin|><|tool_call_begin|>functions.test_function:0<|tool_call_argument_begin|>' + assert info.end == "<|tool_call_end|><|tool_calls_section_end|>" + assert info.trigger == "<|tool_calls_section_begin|>" + + def test_structure_info_different_names(self): + """Test structure_info works with different function names.""" + parser = KimiK2ToolParser() + func = parser.structure_info() + + info1 = func("get_weather") + info2 = func("search_web") + + assert "get_weather" in info1.begin + assert "search_web" in info2.begin + assert info1.end == info2.end == "<|tool_call_end|><|tool_calls_section_end|>" + + def test_kimi_k2_format_compliance(self, sample_tools, parser): + """Test that KimiK2ToolParser follows the documented format structure.""" + + # Test the exact format from the docstring + text = 
'<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"Tokyo"}<|tool_call_end|><|tool_calls_section_end|>' + + result = parser.detect_and_parse(text, sample_tools) + + assert len(result.calls) == 1 + assert result.calls[0].name == "get_weather" + assert json.loads(result.calls[0].parameters) == {"location": "Tokyo"} + + class TestQwen3ToolParser(BaseToolParserTestClass): """Test suite for Qwen3ToolParser class.""" From 9c59c9f9201ebbd7f5acc1dbabdeb2d223e8f53a Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Sat, 13 Dec 2025 00:10:05 +0800 Subject: [PATCH 104/172] [https://nvbugs/5643787][fix] remove the war path for notify to itself (#9834) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> --- .../nixl_utils/transferAgent.cpp | 19 ++++--------------- tests/integration/test_lists/waives.txt | 1 - 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp index 4d39d7f848..dbb7b9fc38 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp @@ -454,21 +454,10 @@ void NixlTransferAgent::invalidateRemoteAgent(std::string const& name) void NixlTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) { - if (name == mName) - { - // FIXME: nixl does not support gen notif to itself ,but support local transfer. 
we use local transfer to notify - // itself - MemoryDescs descs{MemoryType::kDRAM, {MemoryDesc{mDRamSrcBuffer}, MemoryDesc{mDRamDstBuffer}}}; - TransferRequest request{TransferOp::kWRITE, descs, descs, name, syncMessage}; - auto request_status = submitTransferRequests(request); - request_status->wait(); - } - else - { - auto status = mRawAgent->genNotif(name, syncMessage); - TLLM_CHECK_WITH_INFO( - status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str()); - } + + auto status = mRawAgent->genNotif(name, syncMessage); + TLLM_CHECK_WITH_INFO( + status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str()); } [[nodiscard]] std::unordered_map> NixlTransferAgent::getNotifiedSyncMessages() diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 10795f404a..f6c76b23d3 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -411,7 +411,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) -disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5661926) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) 
disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5726066) From 4cc4cbe926a3dac7925845db112dbe0e6a41b91d Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Sat, 13 Dec 2025 00:15:02 +0800 Subject: [PATCH 105/172] [https://nvbugs/5716787][fix] terminate nixl running when exiting (#9785) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Co-authored-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> --- .../tensorrt_llm/executor/cacheCommunicator.h | 1 + .../batch_manager/dataTransceiver.cpp | 10 ++++++++++ .../agent_utils/connection.cpp | 16 ++++++++++++++++ .../cache_transmission/agent_utils/connection.h | 2 ++ .../cache_transmission/mpi_utils/connection.cpp | 9 +++++++++ .../cache_transmission/mpi_utils/connection.h | 3 +++ .../ucx_utils/ucxCacheCommunicator.cpp | 7 ++++++- .../ucx_utils/ucxCacheCommunicator.h | 3 +++ tests/unittest/llmapi/test_llm_pytorch.py | 7 ++++--- 9 files changed, 54 insertions(+), 4 deletions(-) diff --git a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h index 045d9fbc69..9294e11398 100644 --- a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h +++ b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h @@ -66,6 +66,7 @@ public: [[nodiscard]] virtual std::vector getConnections(CommState const& state) = 0; [[nodiscard]] virtual CommState const& getCommState() const = 0; + [[nodiscard]] virtual bool isRunning() const = 0; }; } // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp index 07c1b83dbc..e92d9019aa 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @@ -360,6 +360,12 @@ public: RequestInfo info; auto const* connection = isAgent ? 
agentConnectionManager->recvConnectionAndRequestInfo(info) : mManager->recvConnect(DataContext{TransceiverTag::kID_TAG}, &id, sizeof(id)); + if (connection == nullptr && !mManager->isRunning()) + { + TLLM_LOG_WARNING(" recvRequestInfo connection is nullptr, maybe the server is terminating"); + return info; + } + if (!isAgent) { TLLM_CHECK(id == TransceiverTag::Id::REQUEST_SEND); @@ -616,6 +622,10 @@ private: if (!mReadyResponses.empty()) { auto const& requestInfo = recvRequestInfo(); + if (mTerminate || !mManager->isRunning()) + { + return; + } auto reqId = requestInfo.getRequestId(); { diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index ca391f3724..b9dcc22a57 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -319,6 +319,10 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batc { while (true) { + if (!mIsRunning) + { + return nullptr; + } updateUnhandledNotifications(); std::scoped_lock lock(mNotificationMutex); auto it = mUnhandledNotifications.begin(); @@ -491,6 +495,11 @@ void AgentConnectionManager::waitForNotification(std::string const& remoteAgentN while (true) { + if (!mIsRunning) + { + return; + } + updateUnhandledNotifications(); std::scoped_lock lock(mNotificationMutex); auto it = mUnhandledNotifications.begin(); @@ -587,6 +596,13 @@ std::string const& AgentConnectionManager::getAgentName() const AgentConnectionManager::~AgentConnectionManager() { + mIsRunning = false; m_Agent->deregisterMemory(mRegMemDescs); } + +bool AgentConnectionManager::isRunning() const +{ + return mIsRunning; +} + } // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index 
45d3618a2d..d5a780bf45 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -296,6 +296,7 @@ public: void waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo); void waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo); void waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo); + [[nodiscard]] bool isRunning() const override; private: std::map> mConnections; @@ -309,6 +310,7 @@ private: int mDeviceId; std::string mAgentName; MemoryDescs mRegMemDescs; + std::atomic mIsRunning{true}; }; } // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.cpp index c677ba62b2..cf90cf81f1 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.cpp @@ -77,4 +77,13 @@ CommState const& MpiConnectionManager::getCommState() const return mCommState; } +bool MpiConnectionManager::isRunning() const +{ + return mIsRunning; +} + +MpiConnectionManager::~MpiConnectionManager() +{ + mIsRunning = false; +} } // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.h index aca83131ec..4c5d7873ce 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/mpi_utils/connection.h @@ -42,14 +42,17 @@ class MpiConnectionManager : public ConnectionManager { public: MpiConnectionManager(mpi::MpiComm const* comm); + ~MpiConnectionManager(); MpiConnection const* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] 
std::vector getConnections(CommState const& state) override; [[nodiscard]] CommState const& getCommState() const override; + [[nodiscard]] bool isRunning() const override; private: mpi::MpiComm const* mComm; std::map mConnections; CommState mCommState; + std::atomic mIsRunning{true}; }; } // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.cpp b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.cpp index 4c844968ea..4ad1e7bffc 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.cpp @@ -504,7 +504,7 @@ UcxConnectionManager::~UcxConnectionManager() socket.close(); mZmqRepThread.join(); } - + mIsRunning = false; mZmqRepSocket.close(); mZmqContext.close(); @@ -673,6 +673,11 @@ std::vector UcxConnectionManager::getConnections(CommState co return ret; } +bool UcxConnectionManager::isRunning() const +{ + return mIsRunning; +} + CommState const& UcxConnectionManager::getCommState() const { return mCommState; diff --git a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.h b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.h index 5ce7354489..405871abc1 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/ucx_utils/ucxCacheCommunicator.h @@ -62,6 +62,7 @@ private: zmq::socket_t mZmqRepSocket; std::string mZmqRepEndpoint; std::thread mZmqRepThread; + std::atomic mIsRunning{true}; UcxConnection::ConnectionIdType getNewConnectionId(std::shared_ptr const& newEp); UcxConnection::ConnectionIdType addConnection(std::string const& ip, uint16_t port); @@ -85,6 +86,8 @@ public: { return mRank; } + + [[nodiscard]] bool isRunning() const override; }; #if defined(__clang__) diff --git 
a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 6f17a4cc37..04d653b842 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -1052,8 +1052,9 @@ def test_llm_context_only_timed_out(): @pytest.mark.part0 @skip_ray @pytest.mark.parametrize("sender_future_timeout_ms", [100, 1000]) -def test_llm_context_only_timed_out_kv_cache_exhausted( - sender_future_timeout_ms): +@pytest.mark.parametrize("backend", ["NIXL", "UCX"]) +def test_llm_context_only_timed_out_kv_cache_exhausted(sender_future_timeout_ms, + backend): tp_size = 1 use_overlap = False enable_iter_req_stats = False @@ -1073,7 +1074,7 @@ def test_llm_context_only_timed_out_kv_cache_exhausted( kv_cache_config=kv_cache_config, tensor_parallel_size=tp_size, cache_transceiver_config=CacheTransceiverConfig( - backend="UCX", + backend=backend, kv_transfer_timeout_ms=1000, kv_transfer_sender_future_timeout_ms=sender_future_timeout_ms), **llm_args_extra) From cd4e6395369c0314c9bd2cb94a3c5a9334b37330 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Sat, 13 Dec 2025 00:52:30 +0800 Subject: [PATCH 106/172] [None][feat] Async pp send. 
(#9952) Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- .../_torch/distributed/communicator.py | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py index 67790b240a..5e4968f298 100644 --- a/tensorrt_llm/_torch/distributed/communicator.py +++ b/tensorrt_llm/_torch/distributed/communicator.py @@ -16,6 +16,7 @@ try: except Exception: MPI = None # deferred; functions will error if used when ENABLE_MULTI_DEVICE is True +from tensorrt_llm._torch.hostfunc import hostfunc from tensorrt_llm._utils import (mpi_allgather, mpi_barrier, mpi_comm, mpi_disabled, mpi_isend, mpi_isend_object, mpi_recv, mpi_recv_object, mpi_send, @@ -782,18 +783,57 @@ class TorchDist(Distributed): return ret[0] -class PPCommNCCL: +class PPCommBase: def __init__(self, global_mapping: Mapping): self.mapping = global_mapping + self.tensor_ready_event = torch.cuda.Event() + self.send_stream = torch.cuda.Stream() + self.tensor_cache = {} + + def _cache_tensor(self, tensor: torch.Tensor): + cache_id = id(tensor) + self.tensor_cache[cache_id] = tensor + + @hostfunc + def _release_tensor(self, tensor: torch.Tensor): + cache_id = id(tensor) + del self.tensor_cache[cache_id] + + @abstractmethod + def direct_send(self, tensor: torch.Tensor, dest: int): + raise NotImplementedError("direct_send is not implemented") + + def send(self, tensor: torch.Tensor, dest: Optional[int] = None): + if dest is None: + dest = self.mapping.next_pp_rank() + + # NCCL send kernel in send_stream cannot be captured, + # so we send in the current stream instead in CUDA graph cases. 
+ if torch.cuda.is_current_stream_capturing(): + self.direct_send(tensor, dest) + return + + self.tensor_ready_event.record() + with torch.cuda.stream(self.send_stream): + self.tensor_ready_event.wait() + # tensor may be released before NCCL send finished, + # so we cache it first and release it after send finished. + self._cache_tensor(tensor) + self.direct_send(tensor, dest) + self._release_tensor(tensor) + + +class PPCommNCCL(PPCommBase): + + def __init__(self, global_mapping: Mapping): + super().__init__(global_mapping) self.nccl_comm = torch.classes.trtllm.NcclCommunicatorOp( self.mapping.world_size, self.mapping.rank, ) - def send(self, tensor: torch.Tensor, dest: Optional[int] = None): - if dest is None: - dest = self.mapping.next_pp_rank() + def direct_send(self, tensor: torch.Tensor, dest: int): self.nccl_comm.send(tensor, dest) def recv(self, tensor: torch.Tensor, src: Optional[int] = None): @@ -802,10 +842,10 @@ class PPCommNCCL: self.nccl_comm.recv(tensor, src) -class PPCommTorch: +class PPCommTorch(PPCommBase): def __init__(self, global_mapping: Mapping): - self.mapping = global_mapping + super().__init__(global_mapping) self.pg = self.mapping.pp_group_pg self.pg_group = self.mapping.pp_group @@ -813,10 +853,7 @@ class PPCommTorch: assert global_rank in self.pg_group return self.pg_group.index(global_rank) - def send(self, tensor: torch.Tensor, dest: Optional[int] = None): - if dest is None: - dest = self.mapping.next_pp_rank() - + def direct_send(self, tensor: torch.Tensor, dest: int): self.pg.send([tensor], self._global_to_local_rank(dest), tag=0).wait() def recv(self, tensor: torch.Tensor, src: Optional[int] = None): From 246a8775712f1e1503933a6665e824d3c680c461 Mon Sep 17 00:00:00 2001 From: yuanjingx87 <197832395+yuanjingx87@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:10:32 -0800 Subject: [PATCH 107/172] [None][infra] Remove generate lockfile schedule for 1.2.0rc4.post1 branch (#9945) Signed-off-by: Yuanjing Xue 
<197832395+yuanjingx87@users.noreply.github.com> --- jenkins/GenerateLock.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/jenkins/GenerateLock.groovy b/jenkins/GenerateLock.groovy index f059cf69f2..d700a062b1 100644 --- a/jenkins/GenerateLock.groovy +++ b/jenkins/GenerateLock.groovy @@ -114,7 +114,6 @@ pipeline { triggers { parameterizedCron(''' H 2 * * * %branchName=main;repoUrlKey=tensorrt_llm_github - H 3 * * * %branchName=release/1.2.0rc4.post1;repoUrlKey=tensorrt_llm_github ''') } From 614745215826fcdcf51fabff3f0158e5cf362a71 Mon Sep 17 00:00:00 2001 From: tburt-nv <195370667+tburt-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:35:31 -0800 Subject: [PATCH 108/172] [https://nvbugs/4141427][chore] Add more details to LICENSE file (#9881) Signed-off-by: Tyler Burt <195370667+tburt-nv@users.noreply.github.com> --- LICENSE | 134 +++++++++++++++++- cpp/kernels/fmha_v2/Makefile | 22 +-- cpp/kernels/fmha_v2/setup.py | 40 ++---- cpp/kernels/fmha_v2/train_ops/Makefile | 22 +-- cpp/kernels/fmha_v2/train_ops/train_setup.py | 44 +++--- .../nvrtcWrapper/CMakeLists.txt | 23 +-- .../kernels/selectiveScan/selectiveScan.h | 14 +- examples/mmlu.py | 45 ++---- scripts/generate_lock_file.py | 21 +-- tensorrt_llm/evaluate/mmlu.py | 44 ++---- .../deterministic/mixtral_deterministic.py | 43 ++---- tests/scripts/cute_dsl_kernels/testing.py | 20 +-- 12 files changed, 265 insertions(+), 207 deletions(-) diff --git a/LICENSE b/LICENSE index 7582da94bb..350926f256 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,84 @@ -Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -Portions of this project are under the following copyright: -- Copyright contributors to the vLLM project +This project is licensed under the Apache 2.0 license, whose full license text is available below. 
+ +This project contains portions of code that are based on or derived from +other open source projects, which may have different licenses whose text +is available below. + +All modifications and additions to other projects are licensed under the +Apache License 2.0 unless otherwise specified. Please refer to the individual +file headers for specific copyright and license information. + +Below is a list of other projects that have portions contained by this project: + +-------------------------------------------------------------------------------- +causal-conv1d +-------------------------------------------------------------------------------- +Original Source: https://github.com/Dao-AILab/causal-conv1d +Copyright (c) 2024, Tri Dao. +Licensed under the BSD 3-Clause License + +-------------------------------------------------------------------------------- +flash-linear-attention +-------------------------------------------------------------------------------- +Original Source: https://github.com/fla-org/flash-linear-attention +Copyright (c) 2023-2025 Songlin Yang +Licensed under the MIT License + +-------------------------------------------------------------------------------- +InstructEval +-------------------------------------------------------------------------------- +Original Source: https://github.com/declare-lab/instruct-eval +Copyright (c) 2020 Dan Hendrycks +Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +Licensed under the MIT License + +-------------------------------------------------------------------------------- +Mamba +-------------------------------------------------------------------------------- +Original Source: https://github.com/state-spaces/mamba +Copyright 2023 Tri Dao, Albert Gu +Licensed under the Apache License 2.0 + +-------------------------------------------------------------------------------- +SGLang +-------------------------------------------------------------------------------- +Original Source: 
https://github.com/sgl-project/sglang +Copyright contributors to the SGLang project +Licensed under the Apache License 2.0 + +-------------------------------------------------------------------------------- +Text Generation Inference +-------------------------------------------------------------------------------- +Original Source: https://github.com/huggingface/text-generation-inference +Copyright 2022 Hugging Face +Licensed under the Apache License 2.0 + +-------------------------------------------------------------------------------- +Transformers +-------------------------------------------------------------------------------- +Original Source: https://github.com/huggingface/transformers +Copyright 2018 The HuggingFace Team +Licensed under the Apache License 2.0 + +-------------------------------------------------------------------------------- +XGrammar +-------------------------------------------------------------------------------- +Original Source: https://github.com/mlc-ai/xgrammar +Copyright (c) 2024 by XGrammar Contributors +Licensed under the Apache License 2.0 + +-------------------------------------------------------------------------------- +vLLM +-------------------------------------------------------------------------------- +Original Source: https://github.com/vllm-project/vllm +Copyright contributors to the vLLM project +Licensed under the Apache License 2.0 + +================================================================================ + Apache 2.0 LICENSE +================================================================================ Apache License Version 2.0, January 2004 @@ -204,3 +281,54 @@ Portions of this project are under the following copyright: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +================================================================================ + MIT LICENSE +================================================================================ + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +================================================================================ + BSD 3-Clause License +================================================================================ + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cpp/kernels/fmha_v2/Makefile b/cpp/kernels/fmha_v2/Makefile index e85668ce58..d441deb620 100644 --- a/cpp/kernels/fmha_v2/Makefile +++ b/cpp/kernels/fmha_v2/Makefile @@ -1,18 +1,18 @@ # ################################################################################################## -# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without modification, are not permit- -# ted. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR -# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND -# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ################################################################################################## # ################################################################################################# diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 75b4f2f56e..0bd9329a6f 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -200,38 +200,22 @@ ns_close = r""" copyright = '''\ /*************************************************************************************************** - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 * - * Redistribution and use in source and binary forms, with or without modification, are not permit- - * ted. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. **************************************************************************************************/ -''' if not generate_cu_trtllm else r"""/* -* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & -* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -""" +''' makefile_template = '''\ diff --git a/cpp/kernels/fmha_v2/train_ops/Makefile b/cpp/kernels/fmha_v2/train_ops/Makefile index 54f14e113c..a28edb0490 100644 --- a/cpp/kernels/fmha_v2/train_ops/Makefile +++ b/cpp/kernels/fmha_v2/train_ops/Makefile @@ -1,18 +1,18 @@ # ################################################################################################## -# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without modification, are not permit- -# ted. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR -# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND -# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ################################################################################################## # ################################################################################################# diff --git a/cpp/kernels/fmha_v2/train_ops/train_setup.py b/cpp/kernels/fmha_v2/train_ops/train_setup.py index 9669b294cb..dd3364182d 100755 --- a/cpp/kernels/fmha_v2/train_ops/train_setup.py +++ b/cpp/kernels/fmha_v2/train_ops/train_setup.py @@ -32,20 +32,20 @@ dtype2traits = { fmha_dgrad_v2_flash_attention_template = '''\ /*************************************************************************************************** - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * - * Redistribution and use in source and binary forms, with or without modification, are not permit- - * ted. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. **************************************************************************************************/ #include "fused_multihead_attention_fprop.h" @@ -157,20 +157,20 @@ void run_fmha_dgrad_v2_flash_attention_{dtype}_S_{head_size}_sm{sm}( fmha_fprop_v2_flash_attention_template = '''\ /*************************************************************************************************** - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * - * Redistribution and use in source and binary forms, with or without modification, are not permit- - * ted. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
**************************************************************************************************/ #include "fused_multihead_attention_fprop.h" diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt index 79b0c2ed08..58227e493b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/CMakeLists.txt @@ -1,12 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# ~~~ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this material and related documentation without an express -# license agreement from NVIDIA CORPORATION or its affiliates is strictly -# prohibited. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ~~~ # Add xqa subdirectory for xqa_sources_h target. 
add_subdirectory(${TRT_LLM_ROOT_DIR}/cpp/kernels/xqa xqa_build) diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h index 88f28b991b..b020c40985 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h @@ -1,6 +1,7 @@ /* * Adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan.h * Copyright (c) 2023, Tri Dao. + * Copyright (c) 2022-2024 NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,19 +14,6 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - * - * Not a contribution - * Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as - * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: LicenseRef-NvidiaProprietary - * - * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual - * property and proprietary rights in and to this material, related - * documentation and any modifications thereto. Any use, reproduction, - * disclosure or distribution of this material and related documentation - * without an express license agreement from NVIDIA CORPORATION or - * its affiliates is strictly prohibited. 
*/ #pragma once diff --git a/examples/mmlu.py b/examples/mmlu.py index 9564ed7a48..82bb9a7ec9 100644 --- a/examples/mmlu.py +++ b/examples/mmlu.py @@ -1,38 +1,19 @@ -# MIT License +# SPDX-FileCopyrightText: Copyright (c) 2020 Dan Hendrycks +# SPDX-FileCopyrightText: Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 and MIT # -# Copyright (c) 2020 Dan Hendrycks -# Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +# http://www.apache.org/licenses/LICENSE-2.0 # -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -# Not a contribution -# Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as -# NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NvidiaProprietary -# -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Adapted from https://github.com/declare-lab/instruct-eval Helper script to compare TRTLLM and HF models on the MMLU dataset. Example usage: diff --git a/scripts/generate_lock_file.py b/scripts/generate_lock_file.py index a8cf4a3cdf..9b37858c0e 100755 --- a/scripts/generate_lock_file.py +++ b/scripts/generate_lock_file.py @@ -1,13 +1,18 @@ #!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. 
Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Generates pyproject.toml and poetry.lock files from requirements.txt diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py index 15cfbdf65d..89be382396 100644 --- a/tensorrt_llm/evaluate/mmlu.py +++ b/tensorrt_llm/evaluate/mmlu.py @@ -1,39 +1,21 @@ -# MIT License +# SPDX-FileCopyrightText: Copyright (c) 2020 Dan Hendrycks +# SPDX-FileCopyrightText: Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 and MIT # -# Copyright (c) 2020 Dan Hendrycks -# Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +# http://www.apache.org/licenses/LICENSE-2.0 # -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json -# Not a contribution -# Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as -# NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NvidiaProprietary -# -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. import math from typing import Any, Iterable, List, Optional, Union diff --git a/tests/integration/defs/deterministic/mixtral_deterministic.py b/tests/integration/defs/deterministic/mixtral_deterministic.py index 914a494bfb..53abff63d7 100644 --- a/tests/integration/defs/deterministic/mixtral_deterministic.py +++ b/tests/integration/defs/deterministic/mixtral_deterministic.py @@ -1,38 +1,17 @@ -# MIT License +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Copyright (c) 2020 Dan Hendrycks -# Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +# http://www.apache.org/licenses/LICENSE-2.0 # -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# Not a contribution -# Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as -# NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NvidiaProprietary -# -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse import json diff --git a/tests/scripts/cute_dsl_kernels/testing.py b/tests/scripts/cute_dsl_kernels/testing.py index 55fd37cc36..f7fe6fa8d2 100644 --- a/tests/scripts/cute_dsl_kernels/testing.py +++ b/tests/scripts/cute_dsl_kernels/testing.py @@ -1,13 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Use of this software is governed by the terms and conditions of the -# NVIDIA End User License Agreement (EULA), available at: -# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Any use, reproduction, disclosure, or distribution of this software -# and related documentation outside the scope permitted by the EULA -# is strictly prohibited. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from functools import partial from typing import Callable, Optional, Union From 461446045e62028ff6bc6516f8e5858e26890742 Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:49:25 -0800 Subject: [PATCH 109/172] [TRTLLM-9493][feat] Add helixPostProcessNative kernel for cp_dim=2 (#9924) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/helixKernels.cu | 189 ++++++++++++ cpp/tensorrt_llm/kernels/helixKernels.h | 3 + cpp/tensorrt_llm/thop/helixPostProcessOp.cpp | 97 ++++++ .../_torch/custom_ops/cpp_custom_ops.py | 7 + .../thop/parallel/test_helix_postprocess.py | 284 +++++++++++++----- 5 files changed, 508 insertions(+), 72 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/helixKernels.cu b/cpp/tensorrt_llm/kernels/helixKernels.cu index ffaa490b14..ed4e80a808 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.cu +++ b/cpp/tensorrt_llm/kernels/helixKernels.cu @@ -34,6 +34,9 @@ TRTLLM_NAMESPACE_BEGIN namespace kernels { + +namespace +{ static constexpr int WARP_SIZE = 32; // Utility: warp-level corrected sum @@ -207,6 +210,156 @@ __global__ void helix_postprocess_kernel( } } +static constexpr int MAX_THREADS = 256; +static constexpr int MAX_KV_LORA_BYTES = (MAX_THREADS - WARP_SIZE) * BYTES_O_PER_THREAD; + +// Kernel: fused helix post-processing +// output: [num_tokens, num_heads * kv_lora_rank] (half) +// gathered_o: [num_tokens, num_heads, cp_size, kv_lora_rank] (half) +// gathered_stats: [num_tokens, num_heads, cp_size, 2] (fp32) +// note: we explicitly avoid using restrict here, to avoid getting ld.global.nc +// which may have longer latency +template +__global__ void __launch_bounds__(MAX_THREADS) helix_postprocess_kernel_native( + T* output, T const* gathered_o, float2 const* gathered_stats, int cp_size, int kv_lora_rank) +{ + // Each block processes one (token, head) + // gridDim.x: num_tokens, gridDim.y: num_heads + // there are two separate 
types of warps: + // warp 0 calculates the correction values (one per cp_size) + // all other warps pre-load the gathered_o elements for the current token/head + // and once warp 0 is done, all other warps can start accumulating the output + static constexpr int NUM_O_PER_THREAD = BYTES_O_PER_THREAD / sizeof(T); + + int tok_idx = blockIdx.x; + int head_idx = blockIdx.y; + int num_tokens = gridDim.x; + int num_heads = gridDim.y; + + int const cp_size_aligned = ((cp_size + NUM_PRE_LOAD - 1) / NUM_PRE_LOAD) * NUM_PRE_LOAD; + __shared__ float smem_correction[MAX_CP]; + + int lane_idx = threadIdx.x % WARP_SIZE; + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0); + + // all warps except first pre-load the gathered_o elements for the current + // token/head + T const* gathered_o_off; + gathered_o_off = gathered_o + tok_idx * num_heads * cp_size * kv_lora_rank + head_idx * cp_size * kv_lora_rank; + // we subtract WARP_SIZE because first warp is not participating in pre-load + gathered_o_off += (threadIdx.x - WARP_SIZE) * NUM_O_PER_THREAD; + float4 const* gathered_o_16b = reinterpret_cast(gathered_o_off); + int gathered_16b_stride = (kv_lora_rank) / NUM_O_PER_THREAD; + int stats_offset = tok_idx * num_heads * cp_size + head_idx * cp_size; + int stats_stride = 1; + + // here we have to wait for memory operations of the previous kernel to + // complete +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + + float max_values[MAX_CP_VAL_PER_THREAD]; + float sum_values[MAX_CP_VAL_PER_THREAD]; + T vals[NUM_PRE_LOAD][NUM_O_PER_THREAD]; + float final_sum[NUM_O_PER_THREAD]; + float corr_vals[NUM_PRE_LOAD]; + T output_typed[NUM_O_PER_THREAD]; + + if (warp_idx == 0) + { + // the warp collectively calculates the correction values +#pragma unroll + for (int cp_val_idx = 0; cp_val_idx < MAX_CP_VAL_PER_THREAD; ++cp_val_idx) + { + auto cp_idx = cp_val_idx * WARP_SIZE + lane_idx; + auto stats_idx = stats_offset + cp_idx * 
stats_stride; + float2 stats = cp_idx < cp_size ? gathered_stats[stats_idx] : make_float2(-INFINITY, 0.F); + max_values[cp_val_idx] = stats.x; + sum_values[cp_val_idx] = stats.y; + } + float corrected_values[MAX_CP_VAL_PER_THREAD]; + warpReduceCorrectedSum(corrected_values, max_values, sum_values); +#pragma unroll + for (int cp_val_idx = 0; cp_val_idx < MAX_CP_VAL_PER_THREAD; ++cp_val_idx) + { + auto cp_idx = cp_val_idx * WARP_SIZE + lane_idx; + smem_correction[cp_idx] = corrected_values[cp_val_idx]; + } + } + else + { + // all other warps pre-load the gathered_o elements +#pragma unroll + for (int cp_idx = 0; cp_idx < NUM_PRE_LOAD && cp_idx < cp_size; ++cp_idx) + { + auto val = gathered_o_16b[cp_idx * gathered_16b_stride]; + *reinterpret_cast(vals[cp_idx]) = val; + } +#pragma unroll + for (int o_idx = 0; o_idx < NUM_O_PER_THREAD; ++o_idx) + { + final_sum[o_idx] = 0.F; + } + } + __syncthreads(); + + // warp 0 exits early + if (warp_idx == 0) + return; + + // here we can trigger the dependent kernels to start +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif + +#pragma unroll + for (int cp_idx = 0; cp_idx < NUM_PRE_LOAD && cp_idx < cp_size; ++cp_idx) + { + corr_vals[cp_idx] = smem_correction[cp_idx]; + } + + for (int cp_idx_base = NUM_PRE_LOAD; cp_idx_base < cp_size_aligned; cp_idx_base += NUM_PRE_LOAD) + { +#pragma unroll + for (int cp_idx = 0; cp_idx < NUM_PRE_LOAD; ++cp_idx) + { +#pragma unroll + for (int o_idx = 0; o_idx < NUM_O_PER_THREAD; ++o_idx) + { + final_sum[o_idx] += static_cast(vals[cp_idx][o_idx]) * corr_vals[cp_idx]; + } + } +#pragma unroll + for (int cp_idx = 0; cp_idx < NUM_PRE_LOAD; ++cp_idx) + { + *reinterpret_cast(vals[cp_idx]) = cp_idx_base + cp_idx < cp_size + ? gathered_o_16b[(cp_idx_base + cp_idx) * gathered_16b_stride] + : make_float4(0.F, 0.F, 0.F, 0.F); + corr_vals[cp_idx] = cp_idx_base + cp_idx < cp_size ? 
smem_correction[cp_idx_base + cp_idx] : 0.F; + } + } +#pragma unroll + for (int cp_idx = 0; cp_idx < NUM_PRE_LOAD && cp_idx < cp_size; ++cp_idx) + { +#pragma unroll + for (int o_idx = 0; o_idx < NUM_O_PER_THREAD; ++o_idx) + { + final_sum[o_idx] += static_cast(vals[cp_idx][o_idx]) * corr_vals[cp_idx]; + } + } +#pragma unroll + for (int o_idx = 0; o_idx < NUM_O_PER_THREAD; ++o_idx) + { + output_typed[o_idx] = static_cast(final_sum[o_idx]); + } + auto* output_off = output + tok_idx * num_heads * kv_lora_rank + head_idx * kv_lora_rank; + output_off += (threadIdx.x - WARP_SIZE) * NUM_O_PER_THREAD; + *reinterpret_cast(output_off) = *reinterpret_cast(output_typed); +} + +} // anonymous namespace + template void helixPostProcess(HelixPostProcParams const& params, cudaStream_t stream) { @@ -240,6 +393,42 @@ void helixPostProcess(HelixPostProcParams const& params, cudaStream_t stream) INSTANTIATE_POST_PROC(__half); INSTANTIATE_POST_PROC(__nv_bfloat16); +template +void helixPostProcessNative(HelixPostProcParams const& params, cudaStream_t stream) +{ + // Check that gathered_o is 16-byte aligned + TLLM_CHECK_WITH_INFO(reinterpret_cast(params.gathered_o) % 16 == 0, + "gathered_o must be 16-byte aligned for async memcpy"); + // TODO: Figure why this constraint is specific to this implementation and not legacy one. 
+ TLLM_CHECK_WITH_INFO((params.kv_lora_rank * sizeof(T)) <= MAX_KV_LORA_BYTES, + "kv_lora_rank * sizeof(T) must be <= %zu bytes", MAX_KV_LORA_BYTES); + // Check that kv_lora_rank * sizeof(T) is a multiple of 16 + TLLM_CHECK_WITH_INFO((params.kv_lora_rank * sizeof(T)) % 16 == 0, + "kv_lora_rank * sizeof(T) must be a multiple of 16 for async memcpy"); + // Check that cp_size is not larger than the max fallback CP size + TLLM_CHECK_WITH_INFO(params.cp_size <= MAX_CP, "cp_size > fallback max CP size"); + + auto kernel_instance = helix_postprocess_kernel_native; + cudaLaunchConfig_t config; + config.gridDim = dim3(params.num_tokens, params.num_heads); + config.blockDim = WARP_SIZE + params.kv_lora_rank * sizeof(T) / 16; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, kernel_instance, params.output, params.gathered_o, + params.gathered_stats, params.cp_size, params.kv_lora_rank)); +} + +#define INSTANTIATE_POST_PROC_NATIVE(T) \ + template void helixPostProcessNative(HelixPostProcParams const& params, cudaStream_t stream); + +INSTANTIATE_POST_PROC_NATIVE(__half); +INSTANTIATE_POST_PROC_NATIVE(__nv_bfloat16); + } // namespace kernels TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixKernels.h b/cpp/tensorrt_llm/kernels/helixKernels.h index d7b96e32bd..12036438b7 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.h +++ b/cpp/tensorrt_llm/kernels/helixKernels.h @@ -43,6 +43,9 @@ struct HelixPostProcParams template void helixPostProcess(HelixPostProcParams const& params, cudaStream_t stream); +template +void helixPostProcessNative(HelixPostProcParams const& params, cudaStream_t stream); + } // namespace kernels TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp index f8425cbade..b0d25e38c9 100644 --- a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp +++ b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp @@ -99,14 +99,111 @@ torch::Tensor helix_post_process(torch::Tensor const& gathered_o, torch::Tensor return output; } +template +inline torch::Tensor helix_post_process_native_impl( + torch::Tensor const& gathered_o, torch::Tensor const& gathered_stats, double scale, int cp_dim, Fn fn) +{ + CHECK_TH_CUDA(gathered_o); + CHECK_CONTIGUOUS(gathered_o); + CHECK_TH_CUDA(gathered_stats); + CHECK_CONTIGUOUS(gathered_stats); + + // Only cp_dim=2 is supported + TORCH_CHECK(cp_dim == 2, + "cp_dim must be 2. Expects tensor shapes to be: \n" + "gathered_o: [num_tokens, num_heads, cp_size, kv_lora_rank], \n" + "gathered_stats: [num_tokens, num_heads, cp_size, 2]"); + + // For cp_dim=2: tokens_dim=0, heads_dim=1 + auto tokens_dim = 0; + auto heads_dim = 1; + + TORCH_CHECK(gathered_o.dim() == 4, "gathered_o must be 4D tensor [num_tokens, num_heads, cp_size, kv_lora_rank]"); + TORCH_CHECK(gathered_stats.dim() == 4, "gathered_stats must be 4D tensor [num_tokens, num_heads, cp_size, 2]"); + + auto const num_tokens = gathered_stats.sizes()[tokens_dim]; + auto const num_heads = gathered_stats.sizes()[heads_dim]; + auto const cp_size = gathered_stats.sizes()[2]; + auto const kv_lora_rank = gathered_o.sizes()[3]; + + // check remaining input tensor dimensions + TORCH_CHECK(gathered_o.sizes()[2] == cp_size, "gathered_o cp_size dim must match"); + TORCH_CHECK(gathered_o.sizes()[tokens_dim] == num_tokens, "gathered_o tokens_dim must match num_tokens"); + TORCH_CHECK(gathered_o.sizes()[heads_dim] == num_heads, "gathered_o heads_dim must match num_heads"); + + TORCH_CHECK(gathered_stats.sizes()[3] == 2, "gathered_stats last dimension must be 2"); + + // Check data types + TORCH_CHECK( + gathered_o.scalar_type() == at::ScalarType::Half || gathered_o.scalar_type() == 
at::ScalarType::BFloat16, + "gathered_o must be half or bfloat16"); + TORCH_CHECK(gathered_stats.scalar_type() == at::ScalarType::Float, "gathered_stats must be float32"); + + // Check alignment requirements for gathered_o (16-byte aligned for async + // memcpy) + TORCH_CHECK(reinterpret_cast(gathered_o.data_ptr()) % 16 == 0, "gathered_o must be 16-byte aligned"); + + // Check that kv_lora_rank * sizeof(data_type) is a multiple of 16 + size_t data_type_size = torch::elementSize(gathered_o.scalar_type()); + TORCH_CHECK((kv_lora_rank * data_type_size) % 16 == 0, "kv_lora_rank * sizeof(data_type) must be a multiple of 16"); + + // Create output tensor + std::vector output_shape = {num_tokens, num_heads * kv_lora_rank}; + torch::Tensor output = torch::empty(output_shape, gathered_o.options()); + + // Get CUDA stream + auto stream = at::cuda::getCurrentCUDAStream(gathered_o.get_device()); + + tensorrt_llm::kernels::HelixPostProcParams params{reinterpret_cast(output.mutable_data_ptr()), + reinterpret_cast(gathered_o.data_ptr()), reinterpret_cast(gathered_stats.data_ptr()), + static_cast(cp_size), static_cast(num_tokens), static_cast(num_heads), + static_cast(kv_lora_rank)}; + fn(params, stream); + + if (scale != 1.0) + { + output *= scale; + } + + return output; +} + +inline torch::Tensor helix_post_process_native( + torch::Tensor const& gathered_o, torch::Tensor const& gathered_stats, double scale, int64_t cp_dim) +{ + TORCH_CHECK(cp_dim == 2, "cp_dim must be 2. 
Only cp_dim=2 layout is supported."); + if (gathered_o.scalar_type() == at::ScalarType::Half) + { + return helix_post_process_native_impl<__half>( + gathered_o, gathered_stats, scale, int(cp_dim), tensorrt_llm::kernels::helixPostProcessNative<__half>); + } + else if (gathered_o.scalar_type() == at::ScalarType::BFloat16) + { +#ifdef ENABLE_BF16 + return helix_post_process_native_impl<__nv_bfloat16>(gathered_o, gathered_stats, scale, int(cp_dim), + tensorrt_llm::kernels::helixPostProcessNative<__nv_bfloat16>); +#else + TLLM_THROW("BFloat16 must be enabled to use helix_post_process_native with bf16 tensors."); +#endif + } + else + { + TLLM_THROW("helix_post_process_native only supports half and bfloat16 tensors."); + } +} + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("helix_post_process(Tensor gathered_o, Tensor gathered_stats, float scale) -> Tensor"); + m.def( + "helix_post_process_native(Tensor gathered_o, Tensor gathered_stats, float " + "scale, int cp_dim) -> Tensor"); } TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { m.impl("helix_post_process", helix_post_process); + m.impl("helix_post_process_native", &helix_post_process_native); } } // namespace torch_ext diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py index 348e665475..68b114a8d7 100644 --- a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py @@ -756,6 +756,13 @@ def _register_fake(): def _(gathered_o, gathered_stats, scale): return gathered_o.new_empty(*gathered_o.shape[1:]) + @torch.library.register_fake("trtllm::helix_post_process_native") + def _(gathered_o, gathered_stats, scale, cp_dim): + # Remove the dimension at cp_dim (context parallelism dimension) + out_shape = list(gathered_o.shape) + del out_shape[cp_dim] + return gathered_o.new_empty(*out_shape) + @torch.library.register_fake("trtllm::tinygemm2") def _(input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor): # input [M, K], weight 
[N, K], bias [N] diff --git a/tests/unittest/_torch/thop/parallel/test_helix_postprocess.py b/tests/unittest/_torch/thop/parallel/test_helix_postprocess.py index 7a30e979df..879ddb2b5b 100644 --- a/tests/unittest/_torch/thop/parallel/test_helix_postprocess.py +++ b/tests/unittest/_torch/thop/parallel/test_helix_postprocess.py @@ -22,21 +22,49 @@ from parameterized import parameterized import tensorrt_llm -def baseline(gathered_o, gathered_stats, kv_lora_rank, scale): - """Reference implementation (libtorch)""" - # [cp_size, num_tokens, num_heads] - global_max = gathered_stats[..., 0].max(dim=0, keepdim=True)[0] - # [cp_size, num_tokens, num_heads] - corrected_max = gathered_stats[..., 0] - global_max - corrected_max_exp = torch.exp(corrected_max) - corrected_sum = gathered_stats[..., 1] * corrected_max_exp - global_sum = corrected_sum.sum(dim=0, keepdim=True) - correction = (gathered_stats[..., 1] * corrected_max_exp / global_sum).unsqueeze(-1) - # Cast gathered_o to float32 for computation, then cast output to bf16 at the end - gathered_o_fp32 = gathered_o.to(torch.float32).view(*correction.shape[:-1], kv_lora_rank) - corrected_o = gathered_o_fp32 * correction - # [num_tokens, num_heads * kv_lora_rank] (bf16) - corrected_o = corrected_o.view(*gathered_o.shape[:-1], -1).sum(dim=0) +def baseline(gathered_o, gathered_stats, kv_lora_rank, scale, native=False): + """Reference implementation (libtorch) + + Args: + gathered_o: Input tensor + - native=False: [cp_size, num_tokens, num_heads * kv_lora_rank] + - native=True: [num_tokens, num_heads, cp_size, kv_lora_rank] + gathered_stats: Stats tensor + - native=False: [cp_size, num_tokens, num_heads, 2] + - native=True: [num_tokens, num_heads, cp_size, 2] + kv_lora_rank: KV LoRA rank + scale: Scale factor + native: Whether to use native layout (cp_dim=2) + """ + if native: + # Native layout: cp_dim=2 + # [num_tokens, num_heads, cp_size] + global_max = gathered_stats[..., 0].max(dim=-1, keepdim=True)[0] + corrected_max = 
gathered_stats[..., 0] - global_max + corrected_max_exp = torch.exp(corrected_max) + corrected_sum = gathered_stats[..., 1] * corrected_max_exp + global_sum = corrected_sum.sum(dim=-1, keepdim=True) + correction = (gathered_stats[..., 1] * corrected_max_exp / global_sum).unsqueeze(-1) + gathered_o_fp32 = gathered_o.to(torch.float32) + corrected_o = gathered_o_fp32 * correction + # Sum over cp_size dimension (dim=2), result: [num_tokens, num_heads, kv_lora_rank] + corrected_o = corrected_o.sum(dim=2) + # Reshape to [num_tokens, num_heads * kv_lora_rank] + corrected_o = corrected_o.view(corrected_o.shape[0], -1) + else: + # Original layout: cp_dim=0 + # [cp_size, num_tokens, num_heads] + global_max = gathered_stats[..., 0].max(dim=0, keepdim=True)[0] + corrected_max = gathered_stats[..., 0] - global_max + corrected_max_exp = torch.exp(corrected_max) + corrected_sum = gathered_stats[..., 1] * corrected_max_exp + global_sum = corrected_sum.sum(dim=0, keepdim=True) + correction = (gathered_stats[..., 1] * corrected_max_exp / global_sum).unsqueeze(-1) + gathered_o_fp32 = gathered_o.to(torch.float32).view(*correction.shape[:-1], kv_lora_rank) + corrected_o = gathered_o_fp32 * correction + # [num_tokens, num_heads * kv_lora_rank] + corrected_o = corrected_o.view(*gathered_o.shape[:-1], -1).sum(dim=0) + return corrected_o.to(gathered_o.dtype) * scale @@ -46,71 +74,134 @@ class TestHelixPostProcess(unittest.TestCase): torch.manual_seed(42) torch.cuda.manual_seed(42) - def _test_helix_postprocess(self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype): - """Test helix postprocessing with given parameters""" + def _test_helix_postprocess( + self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native=False + ): + """Test helix postprocessing with given parameters + + Args: + cp_size: Context parallelism size + num_tokens: Number of tokens + num_heads: Number of attention heads + kv_lora_rank: KV LoRA rank + scale: Scale factor + dtype: Data type (float16 
or bfloat16) + native: Whether to use native layout (cp_dim=2) + """ device = torch.device("cuda") - # Create test tensors - # gathered_o_init: [cp_size, num_tokens, num_heads, kv_lora_rank] - gathered_o_init = torch.empty( - cp_size, num_tokens, num_heads, kv_lora_rank, dtype=dtype, device=device - ).uniform_(-1, 1) + if native: + # Native layout: [num_tokens, num_heads, cp_size, kv_lora_rank] + gathered_o = torch.empty( + num_tokens, num_heads, cp_size, kv_lora_rank, dtype=dtype, device=device + ).uniform_(-1, 1) + # gathered_stats: [num_tokens, num_heads, cp_size, 2] + gathered_stats = torch.empty( + num_tokens, num_heads, cp_size, 2, dtype=torch.float32, device=device + ) + gathered_o_max = torch.max(gathered_o, dim=-1, keepdim=True)[0] + gathered_stats[..., 0] = gathered_o_max[..., 0] + gathered_o_sum = torch.sum(torch.exp(gathered_o - gathered_o_max), dim=-1) + gathered_stats[..., 1] = gathered_o_sum - # gathered_stats: [cp_size, num_tokens, num_heads, 2] - gathered_stats = torch.empty( - cp_size, num_tokens, num_heads, 2, dtype=torch.float32, device=device - ) - gathered_o_max = torch.max(gathered_o_init, dim=-1, keepdim=True)[0] - gathered_stats[..., 0] = gathered_o_max[..., 0] - gathered_o_sum = torch.sum(torch.exp(gathered_o_init - gathered_o_max), dim=-1) - gathered_stats[..., 1] = gathered_o_sum + # Call the custom operator with cp_dim=2 + output = torch.ops.trtllm.helix_post_process_native( + gathered_o, gathered_stats, scale, 2 + ) + else: + # Original layout: [cp_size, num_tokens, num_heads, kv_lora_rank] + gathered_o_init = torch.empty( + cp_size, num_tokens, num_heads, kv_lora_rank, dtype=dtype, device=device + ).uniform_(-1, 1) + # gathered_stats: [cp_size, num_tokens, num_heads, 2] + gathered_stats = torch.empty( + cp_size, num_tokens, num_heads, 2, dtype=torch.float32, device=device + ) + gathered_o_max = torch.max(gathered_o_init, dim=-1, keepdim=True)[0] + gathered_stats[..., 0] = gathered_o_max[..., 0] + gathered_o_sum = 
torch.sum(torch.exp(gathered_o_init - gathered_o_max), dim=-1) + gathered_stats[..., 1] = gathered_o_sum - gathered_o = gathered_o_init.view(cp_size, num_tokens, num_heads * kv_lora_rank) + gathered_o = gathered_o_init.view(cp_size, num_tokens, num_heads * kv_lora_rank) - # Call the custom operator - output = torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, scale) + # Call the custom operator + output = torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, scale) # Compute baseline - expected_output = baseline(gathered_o, gathered_stats, kv_lora_rank, scale) + expected_output = baseline(gathered_o, gathered_stats, kv_lora_rank, scale, native=native) # Compare results torch.testing.assert_close(output, expected_output, atol=1e-3, rtol=1e-2) @parameterized.expand( [ - # (cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype) - (4, 8, 2, 64, 1.0, torch.float16), - (8, 16, 4, 128, 0.5, torch.float16), - (16, 32, 8, 256, 2.0, torch.float16), - (4, 8, 2, 64, 1.0, torch.bfloat16), - (8, 16, 4, 128, 0.5, torch.bfloat16), - (16, 32, 8, 256, 2.0, torch.bfloat16), + # (cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native) + (4, 8, 2, 64, 1.0, torch.float16, False), + (8, 16, 4, 128, 0.5, torch.float16, False), + (16, 32, 8, 256, 2.0, torch.float16, False), + (4, 8, 2, 64, 1.0, torch.bfloat16, False), + (8, 16, 4, 128, 0.5, torch.bfloat16, False), + (16, 32, 8, 256, 2.0, torch.bfloat16, False), + (4, 8, 2, 64, 1.0, torch.float16, True), + (8, 16, 4, 128, 0.5, torch.float16, True), + (16, 32, 8, 256, 2.0, torch.float16, True), + (4, 8, 2, 64, 1.0, torch.bfloat16, True), + (8, 16, 4, 128, 0.5, torch.bfloat16, True), + (16, 32, 8, 256, 2.0, torch.bfloat16, True), ] ) def test_helix_postprocess_basic( - self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype + self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native ): """Test basic helix postprocessing functionality""" - self._test_helix_postprocess(cp_size, 
num_tokens, num_heads, kv_lora_rank, scale, dtype) + self._test_helix_postprocess( + cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native + ) @parameterized.expand( [ - # Test edge cases - (1, 1, 1, 16, 1.0, torch.float16), # Minimal sizes - (256, 1, 1, 16, 1.0, torch.float16), # Max cp_size - (128, 1, 1, 16, 1.0, torch.float16), # Single token - (4, 8, 1, 16, 1.0, torch.float16), # Single head - (4, 8, 2, 2048, 1.0, torch.float16), # Large kv_lora_rank + # (cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native) + # Edge cases for non-native layout + (1, 1, 1, 16, 1.0, torch.float16, False), # Minimal sizes + (256, 1, 1, 16, 1.0, torch.float16, False), # Max cp_size + (128, 1, 1, 16, 1.0, torch.float16, False), # Single token + (4, 8, 1, 16, 1.0, torch.float16, False), # Single head + (4, 8, 2, 2048, 1.0, torch.float16, False), # Large kv_lora_rank + # Edge cases for native layout + (1, 1, 1, 16, 1.0, torch.float16, True), # Minimal sizes + (256, 1, 1, 16, 1.0, torch.float16, True), # Max cp_size + (128, 1, 1, 16, 1.0, torch.float16, True), # Single token + (4, 8, 1, 16, 1.0, torch.float16, True), # Single head + # Note: Large kv_lora_rank (2048) exceeds MAX_KV_LORA_BYTES for native kernel ] ) def test_helix_postprocess_edge_cases( - self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype + self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native ): """Test edge cases with minimal dimensions""" - self._test_helix_postprocess(cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype) + self._test_helix_postprocess( + cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native + ) + + @parameterized.expand( + [ + # (cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native) + (16, 16, 64, 512, 1.0, torch.float16, False), + (16, 16, 64, 512, 1.0, torch.bfloat16, False), + (16, 16, 64, 512, 1.0, torch.float16, True), + (16, 16, 64, 512, 1.0, torch.bfloat16, True), + ] + ) + def 
test_helix_postprocess_large_inputs( + self, cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native + ): + """Test with larger inputs to ensure performance and correctness""" + self._test_helix_postprocess( + cp_size, num_tokens, num_heads, kv_lora_rank, scale, dtype, native + ) def test_helix_postprocess_invalid_inputs(self): - """Test error handling for invalid inputs""" + """Test error handling for invalid inputs (non-native)""" device = torch.device("cuda") # Test with wrong tensor dimensions @@ -137,34 +228,83 @@ class TestHelixPostProcess(unittest.TestCase): with pytest.raises(RuntimeError): torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, 1.0) - def test_helix_postprocess_alignment_requirements(self): + def test_helix_postprocess_native_invalid_inputs(self): + """Test error handling for invalid inputs (native layout)""" + device = torch.device("cuda") + + # Test with wrong cp_dim (only cp_dim=2 is supported) + gathered_o = torch.randn(8, 2, 4, 64, dtype=torch.float16, device=device) + gathered_stats = torch.randn(8, 2, 4, 2, dtype=torch.float32, device=device) + + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 0) + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 1) + + # Test with wrong tensor dimensions (3D instead of 4D) + gathered_o = torch.randn(8, 2, 256, dtype=torch.float16, device=device) + gathered_stats = torch.randn(8, 2, 4, 2, dtype=torch.float32, device=device) + + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 2) + + # Test with wrong data types + gathered_o = torch.randn(8, 2, 4, 64, dtype=torch.float32, device=device) + gathered_stats = torch.randn(8, 2, 4, 2, dtype=torch.float32, device=device) + + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 2) + + # Test 
with non-contiguous tensors + gathered_o = torch.randn(8, 2, 4, 64, dtype=torch.float16, device=device).transpose(0, 1) + gathered_stats = torch.randn(8, 2, 4, 2, dtype=torch.float32, device=device) + + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 2) + + @parameterized.expand( + [ + # (native,) + (False,), + (True,), + ] + ) + def test_helix_postprocess_alignment_requirements(self, native): """Test alignment requirements""" device = torch.device("cuda") - # Test with kv_lora_rank that doesn't satisfy alignment requirements # For float16 (2 bytes), kv_lora_rank must be multiple of 8 for 16-byte alignment - # For bfloat16 (2 bytes), kv_lora_rank must be multiple of 8 for 16-byte alignment - # This should work (kv_lora_rank = 64 is multiple of 8) - gathered_o = torch.randn(4, 8, 2 * 64, dtype=torch.float16, device=device) - gathered_stats = torch.randn(4, 8, 2, 2, dtype=torch.float32, device=device) + if native: + # This should work (kv_lora_rank = 64 is multiple of 8) + gathered_o = torch.randn(8, 2, 4, 64, dtype=torch.float16, device=device) + gathered_stats = torch.randn(8, 2, 4, 2, dtype=torch.float32, device=device) - try: - torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, 1.0) - # Should not raise an error - except RuntimeError as e: - pytest.fail(f"Should not raise error for valid alignment: {e}") + try: + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 2) + except RuntimeError as e: + pytest.fail(f"Should not raise error for valid alignment: {e}") - # Test with kv_lora_rank that doesn't satisfy alignment requirements - gathered_o = torch.randn(4, 8, 4, dtype=torch.float16, device=device) - gathered_stats = torch.randn(4, 8, 1, 2, dtype=torch.float32, device=device) - with pytest.raises(RuntimeError): - torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, 1.0) + # Test with kv_lora_rank that doesn't satisfy alignment requirements + gathered_o 
= torch.randn(8, 1, 4, 4, dtype=torch.float16, device=device) + gathered_stats = torch.randn(8, 1, 4, 2, dtype=torch.float32, device=device) + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process_native(gathered_o, gathered_stats, 1.0, 2) + else: + # This should work (kv_lora_rank = 64 is multiple of 8) + gathered_o = torch.randn(4, 8, 2 * 64, dtype=torch.float16, device=device) + gathered_stats = torch.randn(4, 8, 2, 2, dtype=torch.float32, device=device) - def test_helix_postprocess_large_inputs(self): - """Test with larger inputs to ensure performance and correctness""" - self._test_helix_postprocess(16, 16, 64, 512, 1.0, torch.float16) - self._test_helix_postprocess(16, 16, 64, 512, 1.0, torch.bfloat16) + try: + torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, 1.0) + except RuntimeError as e: + pytest.fail(f"Should not raise error for valid alignment: {e}") + + # Test with kv_lora_rank that doesn't satisfy alignment requirements + gathered_o = torch.randn(4, 8, 4, dtype=torch.float16, device=device) + gathered_stats = torch.randn(4, 8, 1, 2, dtype=torch.float32, device=device) + with pytest.raises(RuntimeError): + torch.ops.trtllm.helix_post_process(gathered_o, gathered_stats, 1.0) if __name__ == "__main__": From e4e09867d1e9b76868b5d9af9d1198efcb5d599f Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Sat, 13 Dec 2025 03:26:42 +0000 Subject: [PATCH 110/172] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- .../examples/draft_target_model/poetry.lock | 6 +- security_scanning/examples/eagle/poetry.lock | 6 +- .../examples/lookahead/poetry.lock | 6 +- security_scanning/examples/medusa/poetry.lock | 6 +- .../models/contrib/baichuan/poetry.lock | 6 +- .../examples/models/contrib/bloom/poetry.lock | 6 +- .../models/contrib/chatglm-6b/poetry.lock | 6 +- 
.../models/contrib/chatglm2-6b/poetry.lock | 6 +- .../contrib/chatglm3-6b-32k/poetry.lock | 6 +- .../examples/models/contrib/dbrx/poetry.lock | 6 +- .../models/contrib/deepseek_v1/poetry.lock | 6 +- .../models/contrib/deepseek_v2/poetry.lock | 6 +- .../examples/models/contrib/gptj/poetry.lock | 6 +- .../models/contrib/gptneox/poetry.lock | 6 +- .../examples/models/contrib/grok/poetry.lock | 6 +- .../models/contrib/hyperclovax/poetry.lock | 6 +- .../models/contrib/internlm/poetry.lock | 6 +- .../examples/models/contrib/jais/poetry.lock | 6 +- .../examples/models/contrib/mpt/poetry.lock | 6 +- .../examples/models/contrib/opt/poetry.lock | 6 +- .../models/contrib/skywork/poetry.lock | 6 +- .../examples/models/contrib/smaug/poetry.lock | 6 +- .../examples/models/core/commandr/poetry.lock | 6 +- .../examples/models/core/gemma/poetry.lock | 6 +- .../examples/models/core/glm-4-9b/poetry.lock | 6 +- .../examples/models/core/gpt/poetry.lock | 6 +- .../examples/models/core/nemotron/poetry.lock | 6 +- .../examples/models/core/phi/poetry.lock | 6 +- .../examples/models/core/qwen/poetry.lock | 108 +++++++++--------- .../examples/models/core/qwenvl/poetry.lock | 108 +++++++++--------- security_scanning/examples/ngram/poetry.lock | 6 +- .../examples/quantization/poetry.lock | 6 +- .../examples/ray_orchestrator/poetry.lock | 6 +- .../examples/redrafter/poetry.lock | 6 +- security_scanning/metadata.json | 4 +- .../tests/integration/defs/perf/poetry.lock | 102 ++++++++--------- 36 files changed, 257 insertions(+), 257 deletions(-) diff --git a/security_scanning/examples/draft_target_model/poetry.lock b/security_scanning/examples/draft_target_model/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/draft_target_model/poetry.lock +++ b/security_scanning/examples/draft_target_model/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to 
download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/eagle/poetry.lock b/security_scanning/examples/eagle/poetry.lock index a47fe4162c..ad5f201ef1 100644 --- a/security_scanning/examples/eagle/poetry.lock +++ b/security_scanning/examples/eagle/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/lookahead/poetry.lock b/security_scanning/examples/lookahead/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/lookahead/poetry.lock +++ 
b/security_scanning/examples/lookahead/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/medusa/poetry.lock b/security_scanning/examples/medusa/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/medusa/poetry.lock +++ b/security_scanning/examples/medusa/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git 
a/security_scanning/examples/models/contrib/baichuan/poetry.lock b/security_scanning/examples/models/contrib/baichuan/poetry.lock index 40c6d1c314..803be3fd10 100644 --- a/security_scanning/examples/models/contrib/baichuan/poetry.lock +++ b/security_scanning/examples/models/contrib/baichuan/poetry.lock @@ -781,13 +781,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/bloom/poetry.lock b/security_scanning/examples/models/contrib/bloom/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/models/contrib/bloom/poetry.lock +++ b/security_scanning/examples/models/contrib/bloom/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = 
"sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock index afb3d9ddf4..63cdf97138 100644 --- a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock index afb3d9ddf4..63cdf97138 100644 --- a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = 
"Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock index afb3d9ddf4..63cdf97138 100644 --- a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/dbrx/poetry.lock 
b/security_scanning/examples/models/contrib/dbrx/poetry.lock index e6494c0704..55afe0d7b0 100644 --- a/security_scanning/examples/models/contrib/dbrx/poetry.lock +++ b/security_scanning/examples/models/contrib/dbrx/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = 
"huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock index 84666def73..2f76be099e 100644 --- a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/gptj/poetry.lock b/security_scanning/examples/models/contrib/gptj/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/contrib/gptj/poetry.lock +++ b/security_scanning/examples/models/contrib/gptj/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = 
false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/gptneox/poetry.lock b/security_scanning/examples/models/contrib/gptneox/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/contrib/gptneox/poetry.lock +++ b/security_scanning/examples/models/contrib/gptneox/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/grok/poetry.lock b/security_scanning/examples/models/contrib/grok/poetry.lock index 4d08baf665..7bda39c20b 100644 --- a/security_scanning/examples/models/contrib/grok/poetry.lock +++ 
b/security_scanning/examples/models/contrib/grok/poetry.lock @@ -881,13 +881,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock index 9b7fb57a32..1773f094b5 100644 --- a/security_scanning/examples/models/contrib/hyperclovax/poetry.lock +++ b/security_scanning/examples/models/contrib/hyperclovax/poetry.lock @@ -290,13 +290,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = 
"sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/internlm/poetry.lock b/security_scanning/examples/models/contrib/internlm/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/models/contrib/internlm/poetry.lock +++ b/security_scanning/examples/models/contrib/internlm/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/jais/poetry.lock b/security_scanning/examples/models/contrib/jais/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/models/contrib/jais/poetry.lock +++ b/security_scanning/examples/models/contrib/jais/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file 
= "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/mpt/poetry.lock b/security_scanning/examples/models/contrib/mpt/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/contrib/mpt/poetry.lock +++ b/security_scanning/examples/models/contrib/mpt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/opt/poetry.lock b/security_scanning/examples/models/contrib/opt/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/contrib/opt/poetry.lock +++ b/security_scanning/examples/models/contrib/opt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to 
download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/skywork/poetry.lock b/security_scanning/examples/models/contrib/skywork/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/models/contrib/skywork/poetry.lock +++ b/security_scanning/examples/models/contrib/skywork/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/smaug/poetry.lock b/security_scanning/examples/models/contrib/smaug/poetry.lock index c23e474e3b..52ca91a2b6 
100644 --- a/security_scanning/examples/models/contrib/smaug/poetry.lock +++ b/security_scanning/examples/models/contrib/smaug/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/commandr/poetry.lock b/security_scanning/examples/models/core/commandr/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/core/commandr/poetry.lock +++ b/security_scanning/examples/models/core/commandr/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = 
"huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/gemma/poetry.lock b/security_scanning/examples/models/core/gemma/poetry.lock index 03d2582047..6e4a158c8c 100644 --- a/security_scanning/examples/models/core/gemma/poetry.lock +++ b/security_scanning/examples/models/core/gemma/poetry.lock @@ -872,13 +872,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/glm-4-9b/poetry.lock b/security_scanning/examples/models/core/glm-4-9b/poetry.lock index afb3d9ddf4..63cdf97138 100644 --- a/security_scanning/examples/models/core/glm-4-9b/poetry.lock +++ b/security_scanning/examples/models/core/glm-4-9b/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = 
"sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/gpt/poetry.lock b/security_scanning/examples/models/core/gpt/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/models/core/gpt/poetry.lock +++ b/security_scanning/examples/models/core/gpt/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/nemotron/poetry.lock b/security_scanning/examples/models/core/nemotron/poetry.lock index e30b8b936c..ddb42f46da 100644 --- a/security_scanning/examples/models/core/nemotron/poetry.lock +++ b/security_scanning/examples/models/core/nemotron/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = 
"huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/phi/poetry.lock b/security_scanning/examples/models/core/phi/poetry.lock index a5ab5081cd..8aace7f280 100644 --- a/security_scanning/examples/models/core/phi/poetry.lock +++ b/security_scanning/examples/models/core/phi/poetry.lock @@ -782,13 +782,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock 
b/security_scanning/examples/models/core/qwen/poetry.lock index 4bbf2cccc1..e0d4600e97 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -652,13 +652,13 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.124.2" +version = "0.124.4" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" files = [ - {file = "fastapi-0.124.2-py3-none-any.whl", hash = "sha256:6314385777a507bb19b34bd064829fddaea0eea54436deb632b5de587554055c"}, - {file = "fastapi-0.124.2.tar.gz", hash = "sha256:72e188f01f360e2f59da51c8822cbe4bca210c35daaae6321b1b724109101c00"}, + {file = "fastapi-0.124.4-py3-none-any.whl", hash = "sha256:6d1e703698443ccb89e50abe4893f3c84d9d6689c0cf1ca4fad6d3c15cf69f15"}, + {file = "fastapi-0.124.4.tar.gz", hash = "sha256:0e9422e8d6b797515f33f500309f6e1c98ee4e85563ba0f2debb282df6343763"}, ] [package.dependencies] @@ -699,61 +699,61 @@ files = [ [[package]] name = "fonttools" -version = "4.61.0" +version = "4.61.1" description = "Tools to manipulate font files" optional = false python-versions = ">=3.10" files = [ - {file = "fonttools-4.61.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dc25a4a9c1225653e4431a9413d0381b1c62317b0f543bdcec24e1991f612f33"}, - {file = "fonttools-4.61.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b493c32d2555e9944ec1b911ea649ff8f01a649ad9cba6c118d6798e932b3f0"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad751319dc532a79bdf628b8439af167181b4210a0cd28a8935ca615d9fdd727"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2de14557d113faa5fb519f7f29c3abe4d69c17fe6a5a2595cc8cda7338029219"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:59587bbe455dbdf75354a9dbca1697a35a8903e01fab4248d6b98a17032cee52"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:46cb3d9279f758ac0cf671dc3482da877104b65682679f01b246515db03dbb72"}, - {file = "fonttools-4.61.0-cp310-cp310-win32.whl", hash = "sha256:58b4f1b78dfbfe855bb8a6801b31b8cdcca0e2847ec769ad8e0b0b692832dd3b"}, - {file = "fonttools-4.61.0-cp310-cp310-win_amd64.whl", hash = "sha256:68704a8bbe0b61976262b255e90cde593dc0fe3676542d9b4d846bad2a890a76"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a32a16951cbf113d38f1dd8551b277b6e06e0f6f776fece0f99f746d739e1be3"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:328a9c227984bebaf69f3ac9062265f8f6acc7ddf2e4e344c63358579af0aa3d"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f0bafc8a3b3749c69cc610e5aa3da832d39c2a37a68f03d18ec9a02ecaac04a"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b5ca59b7417d149cf24e4c1933c9f44b2957424fc03536f132346d5242e0ebe5"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:df8cbce85cf482eb01f4551edca978c719f099c623277bda8332e5dbe7dba09d"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7fb5b84f48a6a733ca3d7f41aa9551908ccabe8669ffe79586560abcc00a9cfd"}, - {file = "fonttools-4.61.0-cp311-cp311-win32.whl", hash = "sha256:787ef9dfd1ea9fe49573c272412ae5f479d78e671981819538143bec65863865"}, - {file = "fonttools-4.61.0-cp311-cp311-win_amd64.whl", hash = "sha256:14fafda386377b6131d9e448af42d0926bad47e038de0e5ba1d58c25d621f028"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e24a1565c4e57111ec7f4915f8981ecbb61adf66a55f378fdc00e206059fcfef"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:e2bfacb5351303cae9f072ccf3fc6ecb437a6f359c0606bae4b1ab6715201d87"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0bdcf2e29d65c26299cc3d502f4612365e8b90a939f46cd92d037b6cb7bb544a"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e6cd0d9051b8ddaf7385f99dd82ec2a058e2b46cf1f1961e68e1ff20fcbb61af"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e074bc07c31406f45c418e17c1722e83560f181d122c412fa9e815df0ff74810"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a9b78da5d5faa17e63b2404b77feeae105c1b7e75f26020ab7a27b76e02039f"}, - {file = "fonttools-4.61.0-cp312-cp312-win32.whl", hash = "sha256:9821ed77bb676736b88fa87a737c97b6af06e8109667e625a4f00158540ce044"}, - {file = "fonttools-4.61.0-cp312-cp312-win_amd64.whl", hash = "sha256:0011d640afa61053bc6590f9a3394bd222de7cfde19346588beabac374e9d8ac"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba774b8cbd8754f54b8eb58124e8bd45f736b2743325ab1a5229698942b9b433"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c84b430616ed73ce46e9cafd0bf0800e366a3e02fb7e1ad7c1e214dbe3862b1f"}, - {file = "fonttools-4.61.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2b734d8391afe3c682320840c8191de9bd24e7eb85768dd4dc06ed1b63dbb1b"}, - {file = "fonttools-4.61.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5c5fff72bf31b0e558ed085e4fd7ed96eb85881404ecc39ed2a779e7cf724eb"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:14a290c5c93fcab76b7f451e6a4b7721b712d90b3b5ed6908f1abcf794e90d6d"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:13e3e20a5463bfeb77b3557d04b30bd6a96a6bb5c15c7b2e7908903e69d437a0"}, - {file = "fonttools-4.61.0-cp313-cp313-win32.whl", hash = "sha256:6781e7a4bb010be1cd69a29927b0305c86b843395f2613bdabe115f7d6ea7f34"}, - {file = "fonttools-4.61.0-cp313-cp313-win_amd64.whl", hash = "sha256:c53b47834ae41e8e4829171cc44fec0fdf125545a15f6da41776b926b9645a9a"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:96dfc9bc1f2302224e48e6ee37e656eddbab810b724b52e9d9c13a57a6abad01"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3b2065d94e5d63aafc2591c8b6ccbdb511001d9619f1bca8ad39b745ebeb5efa"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e0d87e81e4d869549585ba0beb3f033718501c1095004f5e6aef598d13ebc216"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cfa2eb9bae650e58f0e8ad53c49d19a844d6034d6b259f30f197238abc1ccee"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4238120002e68296d55e091411c09eab94e111c8ce64716d17df53fd0eb3bb3d"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b6ceac262cc62bec01b3bb59abccf41b24ef6580869e306a4e88b7e56bb4bdda"}, - {file = "fonttools-4.61.0-cp314-cp314-win32.whl", hash = "sha256:adbb4ecee1a779469a77377bbe490565effe8fce6fb2e6f95f064de58f8bac85"}, - {file = "fonttools-4.61.0-cp314-cp314-win_amd64.whl", hash = "sha256:02bdf8e04d1a70476564b8640380f04bb4ac74edc1fc71f1bacb840b3e398ee9"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:627216062d90ab0d98215176d8b9562c4dd5b61271d35f130bcd30f6a8aaa33a"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7b446623c9cd5f14a59493818eaa80255eec2468c27d2c01b56e05357c263195"}, - {file = 
"fonttools-4.61.0-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:70e2a0c0182ee75e493ef33061bfebf140ea57e035481d2f95aa03b66c7a0e05"}, - {file = "fonttools-4.61.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9064b0f55b947e929ac669af5311ab1f26f750214db6dd9a0c97e091e918f486"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cb5e45a824ce14b90510024d0d39dae51bd4fbb54c42a9334ea8c8cf4d95cbe"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6e5ca8c62efdec7972dfdfd454415c4db49b89aeaefaaacada432f3b7eea9866"}, - {file = "fonttools-4.61.0-cp314-cp314t-win32.whl", hash = "sha256:63c7125d31abe3e61d7bb917329b5543c5b3448db95f24081a13aaf064360fc8"}, - {file = "fonttools-4.61.0-cp314-cp314t-win_amd64.whl", hash = "sha256:67d841aa272be5500de7f447c40d1d8452783af33b4c3599899319f6ef9ad3c1"}, - {file = "fonttools-4.61.0-py3-none-any.whl", hash = "sha256:276f14c560e6f98d24ef7f5f44438e55ff5a67f78fa85236b218462c9f5d0635"}, - {file = "fonttools-4.61.0.tar.gz", hash = "sha256:ec520a1f0c7758d7a858a00f090c1745f6cde6a7c5e76fb70ea4044a15f712e7"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = 
"fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwenvl/poetry.lock b/security_scanning/examples/models/core/qwenvl/poetry.lock index 9c7ccf40af..a9b9e21c44 100644 --- a/security_scanning/examples/models/core/qwenvl/poetry.lock +++ b/security_scanning/examples/models/core/qwenvl/poetry.lock @@ -602,61 +602,61 @@ files = [ [[package]] name = "fonttools" -version = "4.61.0" +version = "4.61.1" description = "Tools to manipulate font files" optional = false python-versions = ">=3.10" files = [ - {file = "fonttools-4.61.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dc25a4a9c1225653e4431a9413d0381b1c62317b0f543bdcec24e1991f612f33"}, - {file = "fonttools-4.61.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:6b493c32d2555e9944ec1b911ea649ff8f01a649ad9cba6c118d6798e932b3f0"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad751319dc532a79bdf628b8439af167181b4210a0cd28a8935ca615d9fdd727"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2de14557d113faa5fb519f7f29c3abe4d69c17fe6a5a2595cc8cda7338029219"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:59587bbe455dbdf75354a9dbca1697a35a8903e01fab4248d6b98a17032cee52"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:46cb3d9279f758ac0cf671dc3482da877104b65682679f01b246515db03dbb72"}, - {file = "fonttools-4.61.0-cp310-cp310-win32.whl", hash = "sha256:58b4f1b78dfbfe855bb8a6801b31b8cdcca0e2847ec769ad8e0b0b692832dd3b"}, - {file = "fonttools-4.61.0-cp310-cp310-win_amd64.whl", hash = "sha256:68704a8bbe0b61976262b255e90cde593dc0fe3676542d9b4d846bad2a890a76"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a32a16951cbf113d38f1dd8551b277b6e06e0f6f776fece0f99f746d739e1be3"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:328a9c227984bebaf69f3ac9062265f8f6acc7ddf2e4e344c63358579af0aa3d"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f0bafc8a3b3749c69cc610e5aa3da832d39c2a37a68f03d18ec9a02ecaac04a"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b5ca59b7417d149cf24e4c1933c9f44b2957424fc03536f132346d5242e0ebe5"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:df8cbce85cf482eb01f4551edca978c719f099c623277bda8332e5dbe7dba09d"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7fb5b84f48a6a733ca3d7f41aa9551908ccabe8669ffe79586560abcc00a9cfd"}, - {file = 
"fonttools-4.61.0-cp311-cp311-win32.whl", hash = "sha256:787ef9dfd1ea9fe49573c272412ae5f479d78e671981819538143bec65863865"}, - {file = "fonttools-4.61.0-cp311-cp311-win_amd64.whl", hash = "sha256:14fafda386377b6131d9e448af42d0926bad47e038de0e5ba1d58c25d621f028"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e24a1565c4e57111ec7f4915f8981ecbb61adf66a55f378fdc00e206059fcfef"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e2bfacb5351303cae9f072ccf3fc6ecb437a6f359c0606bae4b1ab6715201d87"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0bdcf2e29d65c26299cc3d502f4612365e8b90a939f46cd92d037b6cb7bb544a"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e6cd0d9051b8ddaf7385f99dd82ec2a058e2b46cf1f1961e68e1ff20fcbb61af"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e074bc07c31406f45c418e17c1722e83560f181d122c412fa9e815df0ff74810"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a9b78da5d5faa17e63b2404b77feeae105c1b7e75f26020ab7a27b76e02039f"}, - {file = "fonttools-4.61.0-cp312-cp312-win32.whl", hash = "sha256:9821ed77bb676736b88fa87a737c97b6af06e8109667e625a4f00158540ce044"}, - {file = "fonttools-4.61.0-cp312-cp312-win_amd64.whl", hash = "sha256:0011d640afa61053bc6590f9a3394bd222de7cfde19346588beabac374e9d8ac"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba774b8cbd8754f54b8eb58124e8bd45f736b2743325ab1a5229698942b9b433"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c84b430616ed73ce46e9cafd0bf0800e366a3e02fb7e1ad7c1e214dbe3862b1f"}, - {file = "fonttools-4.61.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = 
"sha256:b2b734d8391afe3c682320840c8191de9bd24e7eb85768dd4dc06ed1b63dbb1b"}, - {file = "fonttools-4.61.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5c5fff72bf31b0e558ed085e4fd7ed96eb85881404ecc39ed2a779e7cf724eb"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:14a290c5c93fcab76b7f451e6a4b7721b712d90b3b5ed6908f1abcf794e90d6d"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:13e3e20a5463bfeb77b3557d04b30bd6a96a6bb5c15c7b2e7908903e69d437a0"}, - {file = "fonttools-4.61.0-cp313-cp313-win32.whl", hash = "sha256:6781e7a4bb010be1cd69a29927b0305c86b843395f2613bdabe115f7d6ea7f34"}, - {file = "fonttools-4.61.0-cp313-cp313-win_amd64.whl", hash = "sha256:c53b47834ae41e8e4829171cc44fec0fdf125545a15f6da41776b926b9645a9a"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:96dfc9bc1f2302224e48e6ee37e656eddbab810b724b52e9d9c13a57a6abad01"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3b2065d94e5d63aafc2591c8b6ccbdb511001d9619f1bca8ad39b745ebeb5efa"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e0d87e81e4d869549585ba0beb3f033718501c1095004f5e6aef598d13ebc216"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cfa2eb9bae650e58f0e8ad53c49d19a844d6034d6b259f30f197238abc1ccee"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4238120002e68296d55e091411c09eab94e111c8ce64716d17df53fd0eb3bb3d"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b6ceac262cc62bec01b3bb59abccf41b24ef6580869e306a4e88b7e56bb4bdda"}, - {file = "fonttools-4.61.0-cp314-cp314-win32.whl", hash = "sha256:adbb4ecee1a779469a77377bbe490565effe8fce6fb2e6f95f064de58f8bac85"}, - 
{file = "fonttools-4.61.0-cp314-cp314-win_amd64.whl", hash = "sha256:02bdf8e04d1a70476564b8640380f04bb4ac74edc1fc71f1bacb840b3e398ee9"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:627216062d90ab0d98215176d8b9562c4dd5b61271d35f130bcd30f6a8aaa33a"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7b446623c9cd5f14a59493818eaa80255eec2468c27d2c01b56e05357c263195"}, - {file = "fonttools-4.61.0-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:70e2a0c0182ee75e493ef33061bfebf140ea57e035481d2f95aa03b66c7a0e05"}, - {file = "fonttools-4.61.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9064b0f55b947e929ac669af5311ab1f26f750214db6dd9a0c97e091e918f486"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cb5e45a824ce14b90510024d0d39dae51bd4fbb54c42a9334ea8c8cf4d95cbe"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6e5ca8c62efdec7972dfdfd454415c4db49b89aeaefaaacada432f3b7eea9866"}, - {file = "fonttools-4.61.0-cp314-cp314t-win32.whl", hash = "sha256:63c7125d31abe3e61d7bb917329b5543c5b3448db95f24081a13aaf064360fc8"}, - {file = "fonttools-4.61.0-cp314-cp314t-win_amd64.whl", hash = "sha256:67d841aa272be5500de7f447c40d1d8452783af33b4c3599899319f6ef9ad3c1"}, - {file = "fonttools-4.61.0-py3-none-any.whl", hash = "sha256:276f14c560e6f98d24ef7f5f44438e55ff5a67f78fa85236b218462c9f5d0635"}, - {file = "fonttools-4.61.0.tar.gz", hash = "sha256:ec520a1f0c7758d7a858a00f090c1745f6cde6a7c5e76fb70ea4044a15f712e7"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = 
"fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = 
"sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = 
"sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + 
{file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, ] [package.extras] @@ -945,13 +945,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - 
{file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/ngram/poetry.lock b/security_scanning/examples/ngram/poetry.lock index 88fb476241..621b416a3d 100644 --- a/security_scanning/examples/ngram/poetry.lock +++ b/security_scanning/examples/ngram/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/quantization/poetry.lock b/security_scanning/examples/quantization/poetry.lock index aa13aec255..e5647a7b82 100644 --- a/security_scanning/examples/quantization/poetry.lock +++ b/security_scanning/examples/quantization/poetry.lock @@ -736,13 +736,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version 
= "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/examples/ray_orchestrator/poetry.lock b/security_scanning/examples/ray_orchestrator/poetry.lock index 7bfa4210c9..a0e8d1cb4f 100644 --- a/security_scanning/examples/ray_orchestrator/poetry.lock +++ b/security_scanning/examples/ray_orchestrator/poetry.lock @@ -217,13 +217,13 @@ files = [ [[package]] name = "cachetools" -version = "6.2.2" +version = "6.2.3" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.9" files = [ - {file = "cachetools-6.2.2-py3-none-any.whl", hash = "sha256:6c09c98183bf58560c97b2abfcedcbaf6a896a490f534b031b661d3723b45ace"}, - {file = "cachetools-6.2.2.tar.gz", hash = "sha256:8e6d266b25e539df852251cfd6f990b4bc3a141db73b939058d809ebd2590fc6"}, + {file = "cachetools-6.2.3-py3-none-any.whl", hash = "sha256:3fde34f7033979efb1e79b07ae529c2c40808bdd23b0b731405a48439254fba5"}, + {file = "cachetools-6.2.3.tar.gz", hash = "sha256:64e0a4ddf275041dd01f5b873efa87c91ea49022b844b8c5d1ad3407c0f42f1f"}, ] [[package]] diff --git a/security_scanning/examples/redrafter/poetry.lock b/security_scanning/examples/redrafter/poetry.lock index c23e474e3b..52ca91a2b6 100644 --- a/security_scanning/examples/redrafter/poetry.lock +++ 
b/security_scanning/examples/redrafter/poetry.lock @@ -771,13 +771,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "1.2.2" +version = "1.2.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.9.0" files = [ - {file = "huggingface_hub-1.2.2-py3-none-any.whl", hash = "sha256:0f55d7d22058fbf8b29d8095aeee80a7b695aa764f906a21e886c1f87223718f"}, - {file = "huggingface_hub-1.2.2.tar.gz", hash = "sha256:b5b97bd37f4fe5b898a467373044649c94ee32006c032ce8fb835abe9d92ea28"}, + {file = "huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642"}, + {file = "huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7"}, ] [package.dependencies] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 23ee601d5b..ad9a6f4b94 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "e8efeb765d7b2a23e123e80ed10dc7f98348e790", - "timestamp": "2025-12-12T02:39:32Z" + "commit_hash": "461446045e62028ff6bc6516f8e5858e26890742", + "timestamp": "2025-12-13T02:58:07Z" } diff --git a/security_scanning/tests/integration/defs/perf/poetry.lock b/security_scanning/tests/integration/defs/perf/poetry.lock index f5bc59f321..811326cd00 100644 --- a/security_scanning/tests/integration/defs/perf/poetry.lock +++ b/security_scanning/tests/integration/defs/perf/poetry.lock @@ -93,61 +93,61 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] [[package]] name = "fonttools" -version = "4.61.0" +version = "4.61.1" description = "Tools to manipulate font files" optional = false python-versions = ">=3.10" files = [ - {file = "fonttools-4.61.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dc25a4a9c1225653e4431a9413d0381b1c62317b0f543bdcec24e1991f612f33"}, - {file 
= "fonttools-4.61.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b493c32d2555e9944ec1b911ea649ff8f01a649ad9cba6c118d6798e932b3f0"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad751319dc532a79bdf628b8439af167181b4210a0cd28a8935ca615d9fdd727"}, - {file = "fonttools-4.61.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2de14557d113faa5fb519f7f29c3abe4d69c17fe6a5a2595cc8cda7338029219"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:59587bbe455dbdf75354a9dbca1697a35a8903e01fab4248d6b98a17032cee52"}, - {file = "fonttools-4.61.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:46cb3d9279f758ac0cf671dc3482da877104b65682679f01b246515db03dbb72"}, - {file = "fonttools-4.61.0-cp310-cp310-win32.whl", hash = "sha256:58b4f1b78dfbfe855bb8a6801b31b8cdcca0e2847ec769ad8e0b0b692832dd3b"}, - {file = "fonttools-4.61.0-cp310-cp310-win_amd64.whl", hash = "sha256:68704a8bbe0b61976262b255e90cde593dc0fe3676542d9b4d846bad2a890a76"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a32a16951cbf113d38f1dd8551b277b6e06e0f6f776fece0f99f746d739e1be3"}, - {file = "fonttools-4.61.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:328a9c227984bebaf69f3ac9062265f8f6acc7ddf2e4e344c63358579af0aa3d"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f0bafc8a3b3749c69cc610e5aa3da832d39c2a37a68f03d18ec9a02ecaac04a"}, - {file = "fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b5ca59b7417d149cf24e4c1933c9f44b2957424fc03536f132346d5242e0ebe5"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:df8cbce85cf482eb01f4551edca978c719f099c623277bda8332e5dbe7dba09d"}, - {file = "fonttools-4.61.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:7fb5b84f48a6a733ca3d7f41aa9551908ccabe8669ffe79586560abcc00a9cfd"}, - {file = "fonttools-4.61.0-cp311-cp311-win32.whl", hash = "sha256:787ef9dfd1ea9fe49573c272412ae5f479d78e671981819538143bec65863865"}, - {file = "fonttools-4.61.0-cp311-cp311-win_amd64.whl", hash = "sha256:14fafda386377b6131d9e448af42d0926bad47e038de0e5ba1d58c25d621f028"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e24a1565c4e57111ec7f4915f8981ecbb61adf66a55f378fdc00e206059fcfef"}, - {file = "fonttools-4.61.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e2bfacb5351303cae9f072ccf3fc6ecb437a6f359c0606bae4b1ab6715201d87"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0bdcf2e29d65c26299cc3d502f4612365e8b90a939f46cd92d037b6cb7bb544a"}, - {file = "fonttools-4.61.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e6cd0d9051b8ddaf7385f99dd82ec2a058e2b46cf1f1961e68e1ff20fcbb61af"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e074bc07c31406f45c418e17c1722e83560f181d122c412fa9e815df0ff74810"}, - {file = "fonttools-4.61.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a9b78da5d5faa17e63b2404b77feeae105c1b7e75f26020ab7a27b76e02039f"}, - {file = "fonttools-4.61.0-cp312-cp312-win32.whl", hash = "sha256:9821ed77bb676736b88fa87a737c97b6af06e8109667e625a4f00158540ce044"}, - {file = "fonttools-4.61.0-cp312-cp312-win_amd64.whl", hash = "sha256:0011d640afa61053bc6590f9a3394bd222de7cfde19346588beabac374e9d8ac"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba774b8cbd8754f54b8eb58124e8bd45f736b2743325ab1a5229698942b9b433"}, - {file = "fonttools-4.61.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c84b430616ed73ce46e9cafd0bf0800e366a3e02fb7e1ad7c1e214dbe3862b1f"}, - {file = 
"fonttools-4.61.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2b734d8391afe3c682320840c8191de9bd24e7eb85768dd4dc06ed1b63dbb1b"}, - {file = "fonttools-4.61.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a5c5fff72bf31b0e558ed085e4fd7ed96eb85881404ecc39ed2a779e7cf724eb"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:14a290c5c93fcab76b7f451e6a4b7721b712d90b3b5ed6908f1abcf794e90d6d"}, - {file = "fonttools-4.61.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:13e3e20a5463bfeb77b3557d04b30bd6a96a6bb5c15c7b2e7908903e69d437a0"}, - {file = "fonttools-4.61.0-cp313-cp313-win32.whl", hash = "sha256:6781e7a4bb010be1cd69a29927b0305c86b843395f2613bdabe115f7d6ea7f34"}, - {file = "fonttools-4.61.0-cp313-cp313-win_amd64.whl", hash = "sha256:c53b47834ae41e8e4829171cc44fec0fdf125545a15f6da41776b926b9645a9a"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:96dfc9bc1f2302224e48e6ee37e656eddbab810b724b52e9d9c13a57a6abad01"}, - {file = "fonttools-4.61.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3b2065d94e5d63aafc2591c8b6ccbdb511001d9619f1bca8ad39b745ebeb5efa"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e0d87e81e4d869549585ba0beb3f033718501c1095004f5e6aef598d13ebc216"}, - {file = "fonttools-4.61.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cfa2eb9bae650e58f0e8ad53c49d19a844d6034d6b259f30f197238abc1ccee"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4238120002e68296d55e091411c09eab94e111c8ce64716d17df53fd0eb3bb3d"}, - {file = "fonttools-4.61.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b6ceac262cc62bec01b3bb59abccf41b24ef6580869e306a4e88b7e56bb4bdda"}, - {file = 
"fonttools-4.61.0-cp314-cp314-win32.whl", hash = "sha256:adbb4ecee1a779469a77377bbe490565effe8fce6fb2e6f95f064de58f8bac85"}, - {file = "fonttools-4.61.0-cp314-cp314-win_amd64.whl", hash = "sha256:02bdf8e04d1a70476564b8640380f04bb4ac74edc1fc71f1bacb840b3e398ee9"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:627216062d90ab0d98215176d8b9562c4dd5b61271d35f130bcd30f6a8aaa33a"}, - {file = "fonttools-4.61.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7b446623c9cd5f14a59493818eaa80255eec2468c27d2c01b56e05357c263195"}, - {file = "fonttools-4.61.0-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:70e2a0c0182ee75e493ef33061bfebf140ea57e035481d2f95aa03b66c7a0e05"}, - {file = "fonttools-4.61.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9064b0f55b947e929ac669af5311ab1f26f750214db6dd9a0c97e091e918f486"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cb5e45a824ce14b90510024d0d39dae51bd4fbb54c42a9334ea8c8cf4d95cbe"}, - {file = "fonttools-4.61.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6e5ca8c62efdec7972dfdfd454415c4db49b89aeaefaaacada432f3b7eea9866"}, - {file = "fonttools-4.61.0-cp314-cp314t-win32.whl", hash = "sha256:63c7125d31abe3e61d7bb917329b5543c5b3448db95f24081a13aaf064360fc8"}, - {file = "fonttools-4.61.0-cp314-cp314t-win_amd64.whl", hash = "sha256:67d841aa272be5500de7f447c40d1d8452783af33b4c3599899319f6ef9ad3c1"}, - {file = "fonttools-4.61.0-py3-none-any.whl", hash = "sha256:276f14c560e6f98d24ef7f5f44438e55ff5a67f78fa85236b218462c9f5d0635"}, - {file = "fonttools-4.61.0.tar.gz", hash = "sha256:ec520a1f0c7758d7a858a00f090c1745f6cde6a7c5e76fb70ea4044a15f712e7"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = 
"fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = 
"fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = 
"fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, ] [package.extras] From 98d72c7648d797785a612bff37cf2b68d0f598a1 Mon Sep 17 00:00:00 2001 From: Faraz <58580514+farazkh80@users.noreply.github.com> Date: Fri, 12 Dec 2025 22:37:56 -0500 
Subject: [PATCH 111/172] [None][feat] spark cublas LUT table for llama-8b-bf16 perf (#9811) Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com> --- cpp/tensorrt_llm/thop/cublasScaledMM.cpp | 77 ++------------- cpp/tensorrt_llm/thop/cublasScaledMMLut.h | 99 ++++++++++++++++++++ tensorrt_llm/_torch/models/modeling_llama.py | 12 ++- tensorrt_llm/_torch/modules/gated_mlp.py | 3 + 4 files changed, 122 insertions(+), 69 deletions(-) create mode 100644 cpp/tensorrt_llm/thop/cublasScaledMMLut.h diff --git a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp index ddf8024b91..4d3b368cbe 100644 --- a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "cublasScaledMMLut.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/userbuffers/ub_interface.h" @@ -22,10 +23,8 @@ #include "tensorrt_llm/runtime/torchUtils.h" #include "tensorrt_llm/thop/thUtils.h" #include "userbuffersTensor.h" -#include #include #include -#include using torch::Tensor; @@ -39,67 +38,7 @@ namespace using tensorrt_llm::common::check; using tensorrt_llm::common::CublasMMWrapper; - -struct hash_tuple -{ - size_t operator()(std::tuple const& x) const - { - return std::get<0>(x) ^ std::get<1>(x) ^ std::get<2>(x); - } -}; - -// got from cublasTest matmultFind -// {mp2, k, n}: {algo, m_tile, m_stages, m_numsK, m_reduction, m_swizzle, m_custom, m_cga} -using AlgoListType = std::unordered_map, std::array, hash_tuple>; - -// bf16*bf16->fp32->bf16 -AlgoListType spark_bf16_algo_list = { - // GPT-OSS-20b - //-m201088 -n1 -algo21 -m_tile11 -m_stages20 -m_workmem0 -k2880 - {{8, 2880, 201088}, {21, 11, 20, 1, 0, 0, 0, 0}}, - //-m32 -n1 -algo14 -m_reduction2 -m_numsK10 -m_workmem1024 -k2880 - {{8, 2880, 32}, {14, 0, 0, 10, 2, 
0, 0, 0}}, - //-m32 -n2048 -algo21 -m_tile11 -m_stages13 -m_reduction1 -m_numsK9 -m_workmem1024 - //-k2880 - {{2048, 2880, 32}, {21, 11, 13, 9, 1, 0, 0, 0}}, - //-m32 -n2175 -algo21 -m_tile11 -m_stages19 -m_reduction1 -m_numsK11 - //-m_workmem1024 -k2880 - {{4096, 2880, 32}, {21, 11, 19, 11, 1, 0, 0, 0}}, - //-m5120 -n1 -algo23 -m_tile11 -m_stages8 -m_reduction1 -m_numsK2 - //-m_workmem1024 -k2880 - {{8, 2880, 5120}, {23, 11, 8, 2, 1, 0, 0, 0}}, - //-m5120 -n2048 -algo21 -m_tile20 -m_stages15 -m_workmem1024 -k2880 - {{2048, 2880, 5120}, {21, 20, 15, 1, 0, 0, 0, 0}}, - //-m5120 -n2175 -algo21 -m_tile20 -m_stages15 -m_workmem1024 -k2880 - {{4096, 2880, 5120}, {21, 20, 15, 1, 0, 0, 0, 0}}, - //-m2880 -n1 -algo23 -m_tile11 -m_stages14 -m_reduction1 -m_numsK24 -m_workmem1024 -k4096 - {{8, 4096, 2880}, {23, 11, 14, 24, 1, 0, 0, 0}}, - //-m2880 -n2048 -ldc2880 -Poutt -ldd2880 -Ps -Pscales -algo21 -m_tile20 -m_stages15 -m_workmem1024 -k4096 - {{2048, 4096, 2880}, {21, 20, 15, 1, 0, 0, 0, 0}}, - //-m2880 -n2175 -ldc2880 -Poutt -ldd2880 -Ps -Pscales -algo21 -m_tile20 -m_stages15 -m_workmem1024 -k4096 - {{4096, 4096, 2880}, {21, 20, 15, 1, 0, 0, 0, 0}}, -}; - -// bf16*bf16->fp32->bf16 -AlgoListType bf16_algo_list = { - // Deepseek v3/R1 router gemm - // [-algo66 -m_tile10 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom3 -m_mma0 -m_cga2 -m_scheduling1] - {{8, 7168, 256}, {66, 10, 35, 1, 0, 0, 3, 2}}, - {{512, 7168, 256}, {66, 48, 35, 1, 0, 0, 0, 2}}, - {{1024, 7168, 256}, {66, 13, 35, 1, 0, 0, 1, 3}}, -}; - -// fp8*fp8->fp32->fp16 -AlgoListType fp8_algo_list = { - // Llama-3.1-70B - // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom5 -m_mma0 -m_cga2 -m_scheduling1] - {{8, 8192, 8192}, {66, 393, 36, 1, 0, 0, 5, 2}}, - // [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1] - {{8, 8192, 57344}, {66, 10, 36, 1, 0, 0, 1, 2}}, - // Llama-3.3-70B TP4 (this is the default algo on B200. 
Here we aim to use the same algo on GB200.) - // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga4 -m_scheduling1] - {{8, 8192, 14336}, {66, 393, 36, 1, 0, 1, 1, 4}}, -}; +using cublas_lut::AlgoListType; void set_algo_attr(cublasLtMatmulAlgo_t& algo, std::array const& attr_list) { @@ -127,17 +66,18 @@ bool find_special_algo(cublasLtMatmulAlgo_t& algo, std::shared_ptrfind({mp2, k, n}); algo_iter != algo_list->end()) { int const algoID = algo_iter->second[0]; check_cuda_error(cublasLtMatmulAlgoInit( cublasWrapper->getCublasLtHandle(), compType, scaleType, aType, bType, outType, outType, algoID, &algo)); + TLLM_LOG_DEBUG("Found special cublasLt algo for m=%d, k=%d, n=%d\n", m, k, n); set_algo_attr(algo, algo_iter->second); } else diff --git a/cpp/tensorrt_llm/thop/cublasScaledMMLut.h b/cpp/tensorrt_llm/thop/cublasScaledMMLut.h new file mode 100644 index 0000000000..f190de9a0b --- /dev/null +++ b/cpp/tensorrt_llm/thop/cublasScaledMMLut.h @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch_ext +{ +namespace cublas_lut +{ + +struct HashTuple +{ + size_t operator()(std::tuple const& x) const + { + return std::get<0>(x) ^ std::get<1>(x) ^ std::get<2>(x); + } +}; + +// {mp2, k, n}: {algo, m_tile, m_stages, m_numsK, m_reduction, m_swizzle, m_custom, m_cga} +using AlgoListType = std::unordered_map, std::array, HashTuple>; + +inline const AlgoListType spark_bf16_algo_list = { + // llama 8b instruct fp16 decode + // [-algo67 -m_tile6 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom130 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 4096, 4096}, {67, 6, 35, 1, 0, 0, 130, 2}}, + // [-algo67 -m_tile393 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom142 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 4096, 6144}, {67, 393, 35, 1, 0, 0, 142, 2}}, + // [-algo67 -m_tile393 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom142 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 4096, 128256}, {67, 393, 35, 1, 0, 0, 142, 2}}, + + // gpt-oss mxfp4-fp16 decode + // [-algo67 -m_tile393 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom142 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 2880, 201088}, {67, 393, 35, 1, 0, 0, 142, 2}}, + // [-algo14 -m_tile0 -m_stages35 -m_numsK10 -m_reduction2 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{8, 2880, 32}, {14, 0, 0, 10, 2, 0, 0, 0}}, + // [-algo21 -m_tile11 -m_stages13 -m_numsK9 -m_reduction1 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + //-k2880 + {{2048, 2880, 32}, {21, 11, 13, 9, 1, 0, 0, 0}}, + // [-algo21 -m_tile11 -m_stages19 -m_numsK11 -m_reduction1 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + //-m_workmem1024 -k2880 + {{4096, 2880, 32}, {21, 11, 19, 11, 1, 0, 0, 0}}, + // [-algo23 -m_tile11 -m_stages8 -m_numsK2 -m_reduction1 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + //-m_workmem1024 -k2880 + {{8, 2880, 5120}, {23, 11, 8, 2, 1, 0, 0, 0}}, + // [-algo21 -m_tile20 -m_stages15 -m_numsK1 
-m_reduction0 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{2048, 2880, 5120}, {21, 20, 15, 1, 0, 0, 0, 0}}, + // [-algo21 -m_tile20 -m_stages15 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{4096, 2880, 5120}, {21, 20, 15, 1, 0, 0, 0, 0}}, + // [-algo23 -m_tile11 -m_stages14 -m_numsK24 -m_reduction1 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{8, 4096, 2880}, {23, 11, 14, 24, 1, 0, 0, 0}}, + // [-algo21 -m_tile20 -m_stages15 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{2048, 4096, 2880}, {21, 20, 15, 1, 0, 0, 0, 0}}, + // [-algo21 -m_tile20 -m_stages15 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom0 -m_mma0 -m_cga0 -m_scheduling1] + {{4096, 4096, 2880}, {21, 20, 15, 1, 0, 0, 0, 0}}, + +}; + +// bf16*bf16->fp32->bf16 +inline const AlgoListType bf16_algo_list = { + // Deepseek v3/R1 router gemm + // [-algo66 -m_tile10 -m_stages35 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom3 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 7168, 256}, {66, 10, 35, 1, 0, 0, 3, 2}}, + {{512, 7168, 256}, {66, 48, 35, 1, 0, 0, 0, 2}}, + {{1024, 7168, 256}, {66, 13, 35, 1, 0, 0, 1, 3}}, +}; + +// fp8*fp8->fp32->fp16 +inline const AlgoListType fp8_algo_list = { + // Llama-3.1-70B + // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom5 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 8192, 8192}, {66, 393, 36, 1, 0, 0, 5, 2}}, + // [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1] + {{8, 8192, 57344}, {66, 10, 36, 1, 0, 0, 1, 2}}, + // Llama-3.3-70B TP4 (this is the default algo on B200. Here we aim to use the same algo on GB200.) 
+ // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga4 -m_scheduling1] + {{8, 8192, 14336}, {66, 393, 36, 1, 0, 1, 1, 4}}, +}; + +} // namespace cublas_lut +} // namespace torch_ext diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 2cf2cc7410..c09abcb1da 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -230,6 +230,7 @@ class LlamaAttention(Attention): self, model_config: ModelConfig[LlamaConfig], layer_idx: Optional[int] = None, + use_custom_cublas_mm: bool = False, ): config = model_config.pretrained_config super().__init__( @@ -245,6 +246,7 @@ class LlamaAttention(Attention): layer_idx=layer_idx, dtype=config.torch_dtype, config=model_config, + use_custom_cublas_mm=use_custom_cublas_mm, ) @@ -618,6 +620,7 @@ class LlamaDecoderLayer(DecoderLayer): self, model_config: ModelConfig[LlamaConfig], layer_idx: int, + use_custom_cublas_mm: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: super().__init__() config = model_config.pretrained_config @@ -634,6 +637,7 @@ class LlamaDecoderLayer(DecoderLayer): self.self_attn = LlamaAttention( model_config, layer_idx=layer_idx, + use_custom_cublas_mm=use_custom_cublas_mm, ) self.mlp = GatedMLP( @@ -643,6 +647,7 @@ class LlamaDecoderLayer(DecoderLayer): dtype=config.torch_dtype, config=model_config, layer_idx=layer_idx, + use_custom_cublas_mm=use_custom_cublas_mm, ) self.input_layernorm = RMSNorm(hidden_size=config.hidden_size, eps=config.rms_norm_eps, @@ -889,6 +894,8 @@ class LlamaModel(DecoderModel): config = self.model_config.pretrained_config self.num_hidden_layers = config.num_hidden_layers + self.use_custom_cublas_mm = get_sm_version() == 121 + vocab_size = config.vocab_size # TODO smor- we load manually only if there is a single lora dir, need to come up with a better solution self.has_custom_embed_tokens = False @@ -909,6 +916,7 @@ class 
LlamaModel(DecoderModel): vocab_size, config.hidden_size, dtype=config.torch_dtype, + use_custom_cublas_mm=self.use_custom_cublas_mm, ) else: self.embed_tokens = Embedding( @@ -918,6 +926,7 @@ class LlamaModel(DecoderModel): mapping=model_config.mapping, tensor_parallel_mode=TensorParallelMode.COLUMN, gather_output=True, + use_custom_cublas_mm=self.use_custom_cublas_mm, ) if self.has_custom_embed_tokens: @@ -932,7 +941,8 @@ class LlamaModel(DecoderModel): self.embed_tokens.weight.data.copy_(x) self.layers = nn.ModuleList([ - LlamaDecoderLayer(model_config, layer_idx) + LlamaDecoderLayer(model_config, layer_idx, + self.use_custom_cublas_mm) for layer_idx in range(config.num_hidden_layers) ]) self.norm = RMSNorm(hidden_size=config.hidden_size, diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index c1200c7d75..ac3ccb3783 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -32,6 +32,7 @@ class GatedMLP(nn.Module): layer_idx: Optional[int] = None, use_cute_dsl_blockscaling_mm: bool = False, disable_deep_gemm: bool = False, + use_custom_cublas_mm: bool = False, ): super().__init__() @@ -83,6 +84,7 @@ class GatedMLP(nn.Module): use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, disable_deep_gemm=disable_deep_gemm, fused_weight_shard_indices_mapping=gateup_shard_indices_mapping, + use_custom_cublas_mm=use_custom_cublas_mm, ) self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H], @@ -103,6 +105,7 @@ class GatedMLP(nn.Module): force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, disable_deep_gemm=disable_deep_gemm, + use_custom_cublas_mm=use_custom_cublas_mm, ) # These two modules are mutually exclusive - either splitted_gate_up_lora or fused_gate_up_lora will be used, From e49c70f6dfe52dbd7b3c793ed7bb384be9e35cbc Mon Sep 17 00:00:00 2001 From: bhsueh_NV <11360707+byshiue@users.noreply.github.com> 
Date: Sat, 13 Dec 2025 11:44:27 +0800 Subject: [PATCH 112/172] [None][feat] Support Mistral Large3 LLM part (#9820) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- examples/llm-api/quickstart_advanced.py | 6 + .../models/core/mistral_large_3/README.md | 53 +++ requirements.txt | 1 + .../_torch/models/checkpoints/__init__.py | 29 +- .../models/checkpoints/hf/weight_loader.py | 1 + .../models/checkpoints/mistral/__init__.py | 0 .../checkpoints/mistral/checkpoint_loader.py | 75 +++++ .../checkpoints/mistral/config_loader.py | 314 ++++++++++++++++++ .../checkpoints/mistral/weight_mapper.py | 131 ++++++++ .../_torch/models/modeling_deepseekv3.py | 24 +- .../_torch/models/modeling_mistral.py | 184 +++++++--- .../_torch/models/modeling_mistral_large3.py | 70 ++++ .../_torch/pyexecutor/config_utils.py | 8 + tensorrt_llm/inputs/registry.py | 6 + tensorrt_llm/serve/openai_server.py | 3 +- .../defs/accuracy/references/gsm8k.yaml | 2 + .../defs/accuracy/references/mmlu.yaml | 2 + .../defs/accuracy/test_llm_api_pytorch.py | 104 ++++++ .../test_lists/test-db/l0_dgx_b200.yml | 1 + .../test-db/l0_gb200_multi_gpus.yml | 1 + 20 files changed, 946 insertions(+), 69 deletions(-) create mode 100644 examples/models/core/mistral_large_3/README.md create mode 100644 tensorrt_llm/_torch/models/checkpoints/mistral/__init__.py create mode 100644 tensorrt_llm/_torch/models/checkpoints/mistral/checkpoint_loader.py create mode 100644 tensorrt_llm/_torch/models/checkpoints/mistral/config_loader.py create mode 100644 tensorrt_llm/_torch/models/checkpoints/mistral/weight_mapper.py create mode 100644 tensorrt_llm/_torch/models/modeling_mistral_large3.py diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index 9b37f8c7b2..5aa7f7ce70 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -23,6 +23,11 @@ def add_llm_args(parser): type=str, nargs="+", help="A single or a list of text 
prompts.") + parser.add_argument('--checkpoint_format', + type=str, + default=None, + choices=["HF", "mistral"], + help="Model checkpoint format.") # Build config parser.add_argument("--max_seq_len", type=int, @@ -237,6 +242,7 @@ def setup_llm(args, **kwargs): llm = LLM( model=args.model_dir, backend='pytorch', + checkpoint_format=args.checkpoint_format, disable_overlap_scheduler=args.disable_overlap_scheduler, kv_cache_config=kv_cache_config, attn_backend=args.attention_backend, diff --git a/examples/models/core/mistral_large_3/README.md b/examples/models/core/mistral_large_3/README.md new file mode 100644 index 0000000000..dfd3fd0c28 --- /dev/null +++ b/examples/models/core/mistral_large_3/README.md @@ -0,0 +1,53 @@ +# Mistral Large V3 + +* Setup the model path + +```bash +export mistral_large_3_model_path= +``` + +## LLM-only run + +* Run the Mistral Large V3 by `quickstart_advanced.py` + +```bash +mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickstart_advanced.py \ + --model_dir ${mistral_large_3_model_path} \ + --tp_size 4 \ + --moe_ep_size 4 \ + --max_tokens 100 \ + --checkpoint_format mistral \ + --moe_backend TRTLLM +``` + +* Launch the trtllm-serve and send a request + +```bash +echo " +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +enable_attention_dp: false +kv_cache_config: + enable_block_reuse: true +checkpoint_format: mistral +" > serve.yml +mpirun -n 1 --allow-run-as-root --oversubscribe python3 -m tensorrt_llm.commands.serve serve \ + ${mistral_large_3_model_path} \ + --host localhost --port 8001 --backend pytorch \ + --extra_llm_api_options serve.yml \ + --tokenizer ${mistral_large_3_model_path} \ + 2>&1 | tee serve_debug.log & + +curl http://localhost:8001/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "${mistral_large_3_model_path}", + "prompt": "The capital of France is", + "max_tokens": 16, + "top_k": 16 + }' + +# The result would be like 
+{"id":"cmpl-7e342c1d722d4226a1bf3ed35d762c35","object":"text_completion","created":1764061351,"model":"${mistral_large_3_model_path}","choices":[{"index":0,"text":"The capital of France is **Paris**.\n\nParis is the largest city in France and","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":7,"total_tokens":23,"completion_tokens":16,"prompt_tokens_details":{"cached_tokens":1}},"prompt_token_ids":null} +``` diff --git a/requirements.txt b/requirements.txt index e123aafcde..8f740a9ede 100644 --- a/requirements.txt +++ b/requirements.txt @@ -75,3 +75,4 @@ numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing partial_json_parser apache-tvm-ffi==0.1.4 # used for reduce nvidia-cutlass-dsl host overhead torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf +mistral-common==1.8.6 diff --git a/tensorrt_llm/_torch/models/checkpoints/__init__.py b/tensorrt_llm/_torch/models/checkpoints/__init__.py index 6a7426eb5b..590a4c7ea9 100644 --- a/tensorrt_llm/_torch/models/checkpoints/__init__.py +++ b/tensorrt_llm/_torch/models/checkpoints/__init__.py @@ -12,11 +12,30 @@ from .hf.qwen3_moe_weight_mapper import Qwen3MoeHfWeightMapper from .hf.qwen3_next_weight_mapper import Qwen3NextHfWeightMapper from .hf.weight_loader import HfWeightLoader from .hf.weight_mapper import HfWeightMapper +from .mistral.checkpoint_loader import (MistralCheckpointLoader, + MistralLarge3CheckpointLoader) +from .mistral.config_loader import MistralConfigLoader +from .mistral.weight_mapper import (MistralLarge3WeightMapper, + MistralWeightMapper) __all__ = [ - "HfConfigLoader", "HfWeightLoader", "HfWeightMapper", - "BaseCheckpointLoader", "HfCheckpointLoader", "NemotronHHfWeightMapper", - "Gemma3HfWeightMapper", "MixtralHfWeightMapper", "Llama4HfWeightMapper", - 
"Qwen2MoeHfWeightMapper", "Qwen3MoeHfWeightMapper", "Qwen2VLHfWeightMapper", - "Qwen3NextHfWeightMapper", "LlavaNextHfWeightMapper" + "HfConfigLoader", + "HfWeightLoader", + "HfWeightMapper", + "MistralConfigLoader", + "MistralWeightMapper", + "MistralCheckpointLoader", + "BaseCheckpointLoader", + "HfCheckpointLoader", + "NemotronHHfWeightMapper", + "Gemma3HfWeightMapper", + "MixtralHfWeightMapper", + "Llama4HfWeightMapper", + "Qwen2MoeHfWeightMapper", + "Qwen3MoeHfWeightMapper", + "Qwen2VLHfWeightMapper", + "Qwen3NextHfWeightMapper", + "LlavaNextHfWeightMapper", + "MistralLarge3CheckpointLoader", + "MistralLarge3WeightMapper", ] diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py b/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py index 7c24f19ae7..3b1c3af172 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py @@ -19,6 +19,7 @@ from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping +@register_checkpoint_weight_loader("mistral") @register_checkpoint_weight_loader("HF") class HfWeightLoader(BaseWeightLoader): """ diff --git a/tensorrt_llm/_torch/models/checkpoints/mistral/__init__.py b/tensorrt_llm/_torch/models/checkpoints/mistral/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorrt_llm/_torch/models/checkpoints/mistral/checkpoint_loader.py b/tensorrt_llm/_torch/models/checkpoints/mistral/checkpoint_loader.py new file mode 100644 index 0000000000..433bde665b --- /dev/null +++ b/tensorrt_llm/_torch/models/checkpoints/mistral/checkpoint_loader.py @@ -0,0 +1,75 @@ +from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader +from tensorrt_llm._torch.models.checkpoints.base_weight_loader import BaseWeightLoader +from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import BaseWeightMapper +from tensorrt_llm._torch.models.checkpoints.hf.checkpoint_loader import 
HfCheckpointLoader +from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import MistralConfigLoader +from tensorrt_llm._torch.models.modeling_utils import register_checkpoint_loader + + +@register_checkpoint_loader("mistral") +class MistralCheckpointLoader(HfCheckpointLoader): + def __init__( + self, + *, + weight_loader: BaseWeightLoader | None = None, + weight_mapper: BaseWeightMapper | None = None, + config_loader: BaseConfigLoader | None = None, + ): + super().__init__( + weight_loader=weight_loader, weight_mapper=weight_mapper, config_loader=config_loader + ) + self._checkpoint_format = "mistral" + self.mm_module_mapping = { + "vision_encoder": "vision_tower", + "pre_mm_projector_norm": "multi_modal_projector.norm", + "vision_language_adapter": "multi_modal_projector", + "patch_merger": "multi_modal_projector.patch_merger", + } + + def preprocess_weights(self, weights: dict) -> dict: + """ + Aggregate weights by module + """ + hf_weights = {} + + for key, value in weights.items(): + modules = key.split(".") + + if modules[0] not in self.mm_module_mapping.keys(): + hf_weights["language_model." 
+ key] = value + + else: + modules[0] = self.mm_module_mapping[modules[0]] + hf_weights[".".join(modules)] = value + + return hf_weights + + def inverse_nvfp4_global_scales(self, weights): + for key in weights.keys(): + if "global_scale" in key: + weights[key] = 1.0 / weights[key] + + def load_weights(self, checkpoint_dir: str, **kwargs): + weights = super().weight_loader.load_weights(checkpoint_dir, **kwargs) + weights = self.preprocess_weights(weights) + # The definition of global_scale is different in Mistral, need to inverse the scale + self.inverse_nvfp4_global_scales(weights) + return weights + + def get_default_config_loader(self) -> MistralConfigLoader: + return MistralConfigLoader() + + +@register_checkpoint_loader("mistral_large_3") +class MistralLarge3CheckpointLoader(MistralCheckpointLoader): + def __init__( + self, + *, + weight_loader: BaseWeightLoader | None = None, + weight_mapper: BaseWeightMapper | None = None, + config_loader: BaseConfigLoader | None = None, + ): + super().__init__( + weight_loader=weight_loader, weight_mapper=weight_mapper, config_loader=config_loader + ) + self._checkpoint_format = "mistral_large_3" diff --git a/tensorrt_llm/_torch/models/checkpoints/mistral/config_loader.py b/tensorrt_llm/_torch/models/checkpoints/mistral/config_loader.py new file mode 100644 index 0000000000..95e93fdc05 --- /dev/null +++ b/tensorrt_llm/_torch/models/checkpoints/mistral/config_loader.py @@ -0,0 +1,314 @@ +import json +from pathlib import Path +from typing import Any + +from transformers import PretrainedConfig, WhisperConfig + +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader +from tensorrt_llm._torch.models.modeling_utils import register_config_loader +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization.mode import QuantAlgo + +################### +# vllm code here +# 
https://github.com/vllm-project/vllm/blob/48a5fff66e78985a634abac0d8d7f271da744000/vllm/transformers_utils/configs/mistral.py +################### + + +def adapt_config_dict( + config_dict: dict[str, Any], + defaults: dict[str, Any] = {}, +) -> PretrainedConfig: + config_dict = _remap_general_mistral_args(config_dict) + + if bool(config_dict.get("quantization")): + config_dict = _remap_mistral_quantization_args(config_dict) + + is_moe = bool(config_dict.get("moe")) + is_mistral_large_3 = is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0 + if config_dict.get("model_type") == "mamba": + config_dict["architectures"] = ["Mamba2ForCausalLM"] + elif is_moe and is_mistral_large_3: + config_dict = _remap_moe_args(config_dict) + config_dict["model_type"] = "deepseek_v3" + config_dict["architectures"] = ["MistralLarge3ForCausalLM"] + + assert "llama_4_scaling" in config_dict, "MistralLarge3 expect llama4 scaling config." + llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"] + assert all( + [key in config_dict["llama_4_scaling"] for key in llama_4_scaling_config_keys] + ), f"llama_4_scaling config should define the keys: {','.join(llama_4_scaling_config_keys)}" + elif is_moe: + config_dict["architectures"] = ["MixtralForCausalLM"] + else: + config_dict["architectures"] = ["MistralForCausalLM"] + + if bool(config_dict.get("yarn")): + config_dict = _remap_mistral_yarn_args(config_dict) + + if bool(config_dict.get("llama_4_scaling")): + llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"] + assert all( + [key in config_dict["llama_4_scaling"] for key in llama_4_scaling_config_keys] + ), f"llama_4_scaling config should define the keys: {','.join(llama_4_scaling_config_keys)}" + + is_vision = (config_dict.get("multimodal") or {}).get("vision_encoder_args") or config_dict.get( + "vision_encoder" + ) + is_audio = bool( + ((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get("encoder_args") + ) + + 
assert not (is_vision and is_audio), "Vision and audio are mutually exclusive" + + if is_vision: + config_dict = _remap_mistral_vision_args(config_dict) + if is_audio: + config_dict = _remap_mistral_audio_args(config_dict) + + for k, v in defaults.items(): + config_dict.setdefault(k, v) + + config = PretrainedConfig.from_dict(config_dict) + + return config + + +def _remap_mistral_vision_args(config: dict) -> dict: + if config.get("multimodal"): + vision_config = config.pop("multimodal") + else: + vision_config = config.pop("vision_encoder") + + quant_config = config.get("quantization_config") + config = { + "model_type": "pixtral", + "architectures": ["PixtralForConditionalGeneration"], + "text_config": PretrainedConfig.from_dict(config), + "vision_config": PretrainedConfig.from_dict(vision_config), + } + if quant_config: + config["quantization_config"] = quant_config + return config + + +def _remap_mistral_yarn_args(config: dict) -> dict: + yarn_config_map = { + "factor": "factor", + "original_max_position_embeddings": "original_max_position_embeddings", + "beta": "beta_fast", + "alpha": "beta_slow", + "apply_scale": "apply_yarn_scaling", + } + yarn_config = config.get("yarn") or {} + config["rope_parameters"] = { + "rope_type": "yarn", + "mscale_all_dim": 1, + } + + if rope_theta := config.pop("rope_theta", None): + config["rope_parameters"]["rope_theta"] = rope_theta + + for old_name, new_name in yarn_config_map.items(): + if old_name in yarn_config: + config["rope_parameters"][new_name] = yarn_config.pop(old_name) + + assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" + + return config + + +def _remap_general_mistral_args(config: dict) -> dict: + # Mistral key -> HF key + config_mapping = { + "dim": "hidden_size", + "norm_eps": "rms_norm_eps", + "n_kv_heads": "num_key_value_heads", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "hidden_dim": "intermediate_size", + } + # HF key -> (Mistral key, default value) + 
top_level_mapping_with_default = { + "model_type": ("model_type", "transformer"), + "hidden_act": ("activation", "silu"), + "tie_word_embeddings": ("tied_embeddings", False), + "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), + "max_position_embeddings": ("max_position_embeddings", 128_000), + } + + for key, new_key in config_mapping.items(): + if key in config: + config[new_key] = config.pop(key) + + for new_key, (key, default_value) in top_level_mapping_with_default.items(): + config[new_key] = config.pop(key, default_value) + + return config + + +def _remap_mistral_quantization_args(config: dict) -> dict: + if config.get("quantization"): + quantization = config.pop("quantization", {}) + if quantization.get("qformat_weight") == "fp8_e4m3": + qscheme_act = quantization.get("qscheme_act") + assert qscheme_act in ("NO_SCALES", "TENSOR", None), ( + "Only NO_SCALES and TENSOR (default) are supported for qscheme_act" + ) + is_dynamic = qscheme_act == "NO_SCALES" + config["quantization_config"] = { + "quant_method": "fp8", + "activation_scheme": "dynamic" if is_dynamic else "static", + } + else: + raise ValueError(f"Found unknown quantization='{quantization}' in config") + + return config + + +def _remap_mistral_audio_args(config: dict) -> dict: + whisper_args = config["multimodal"].pop("whisper_model_args") + encoder_args = whisper_args["encoder_args"] + downsample_args = whisper_args["downsample_args"] + + quant_config = config.get("quantization_config") + config = { + "model_type": "whixtral", + "architectures": ["VoxtralForConditionalGeneration"], + "text_config": PretrainedConfig.from_dict(config), + "audio_config": WhisperConfig( + num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"], + window_size=encoder_args["audio_encoding_args"]["window_size"], + sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"], + hop_length=encoder_args["audio_encoding_args"]["hop_length"], + 
downsample_factor=downsample_args["downsample_factor"], + d_model=encoder_args["dim"], + encoder_layers=encoder_args["n_layers"], + encoder_ffn_dim=encoder_args["hidden_dim"], + encoder_attention_heads=encoder_args["n_heads"], + vocab_size=encoder_args["vocab_size"], + max_source_positions=encoder_args["max_source_positions"], + is_encoder_decoder=False, # Override WhisperConfig default + ), + } + if quant_config: + config["quantization_config"] = quant_config + return config + + +def _remap_moe_args(config: dict) -> dict: + moe_config_map = { + "route_every_n": "moe_layer_freq", + "first_k_dense_replace": "first_k_dense_replace", + "num_experts_per_tok": "num_experts_per_tok", + "num_experts": "n_routed_experts", + "expert_hidden_dim": "moe_intermediate_size", + "routed_scale": "routed_scaling_factor", + "num_shared_experts": "n_shared_experts", + "num_expert_groups": "n_group", + "num_expert_groups_per_tok": "topk_group", + } + moe_config = config.get("moe", {}) + for old_name, new_name in moe_config_map.items(): + if old_name in moe_config: + value = moe_config.pop(old_name) + config[new_name] = value + + config["topk_method"] = None + config["norm_topk_prob"] = True + config["scoring_func"] = "softmax" + + return config + + +###################### +# End of vllm code +###################### + + +@register_config_loader("mistral") +@register_config_loader("mistral_large_3") +class MistralConfigLoader(BaseConfigLoader): + def _load_mistral_config_dict(self, checkpoint_dir: str, config_file_name: str) -> dict | None: + file_path = Path(checkpoint_dir) / Path(config_file_name) + + if file_path.exists() and file_path.is_file(): + with open(file_path) as file: + return json.load(file) + return None + + # Adaptation of + # https://github.com/vllm-project/vllm/blob/48a5fff66e78985a634abac0d8d7f271da744000/vllm/transformers_utils/config.py#L175 + def _parse_mistral_config(self, checkpoint_dir: str): + config_file_name = "params.json" + + # This function loads a 
params.json config which + # should be used when loading models in mistral format + config_dict = self._load_mistral_config_dict(checkpoint_dir, config_file_name) + if config_dict is None: + raise ValueError( + f"Failed to load '{config_file_name}' config from '{checkpoint_dir}'. " + f"Only local checkpoints are supported for mistral format." + ) + assert isinstance(config_dict, dict) + + if (max_position_embeddings := config_dict.get("max_position_embeddings")) is None: + max_position_embeddings = 128_000 + config_dict["max_position_embeddings"] = max_position_embeddings + + pretrained_config = adapt_config_dict(config_dict) + + # Mistral configs may define sliding_window as list[int]. Convert it + # to int and add the layer_types list[str] to make it HF compatible + if (sliding_window := getattr(pretrained_config, "sliding_window", None)) and isinstance( + sliding_window, list + ): + pattern_repeats = pretrained_config.num_hidden_layers // len(sliding_window) + layer_types = sliding_window * pattern_repeats + pretrained_config.layer_types = [ + "full_attention" if layer_type is None else "sliding_attention" + for layer_type in layer_types + ] + pretrained_config.sliding_window = next(filter(None, sliding_window), None) + + return config_dict, pretrained_config + + def load(self, checkpoint_dir: str, **kwargs) -> ModelConfig: + # Re-write from ModelConfig.from_pretrained + + config_dict, pretrained_config = self._parse_mistral_config(checkpoint_dir) + + # Some checkpoints lack torch_dtype, populate with dtype + pretrained_config.torch_dtype = getattr(pretrained_config, "dtype", None) + quant_config = QuantConfig() + layer_quant_config = None + + hf_quant_config = pretrained_config.quantization_config + if hf_quant_config.get("quant_method") == "compressed-tensors": + if "NVFP4" in hf_quant_config.get("config_groups"): + quant_config.quant_algo = QuantAlgo.NVFP4 + quant_config.group_size = 16 + ignore_list = hf_quant_config.get("ignore", []) + 
quant_config.exclude_modules = [] + if "re:.*attn.*" in ignore_list: + quant_config.exclude_modules.append("model.layers.*.self_attn.*") + if "re:vision_encoder.*" in ignore_list: + quant_config.exclude_modules.append("vision_encoder*") + if "re:vision_language_adapter.*" in ignore_list: + quant_config.exclude_modules.append("vision_language_adapter*") + + elif "FP8_BLOCK" in hf_quant_config.get("config_groups"): + quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES + quant_config.group_size = 128 + quant_config.exclude_modules = ["*q_a_proj*", "*kv_a_proj_with_mqa*"] + + kwargs.pop("trust_remote_code", None) # ModelConfig does not have this input parameter + model_config = ModelConfig( + pretrained_config=pretrained_config, + quant_config=quant_config, + quant_config_dict=layer_quant_config, + **kwargs, + ) + model_config._frozen = True + return model_config diff --git a/tensorrt_llm/_torch/models/checkpoints/mistral/weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/mistral/weight_mapper.py new file mode 100644 index 0000000000..28362f1f90 --- /dev/null +++ b/tensorrt_llm/_torch/models/checkpoints/mistral/weight_mapper.py @@ -0,0 +1,131 @@ +from torch import nn + +from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper +from tensorrt_llm._torch.models.modeling_utils import register_mapper + + +@register_mapper("mistral", "MistralForCausalLM") +@register_mapper("mistral", "PixtralForConditionalGeneration") +class MistralWeightMapper(HfWeightMapper): + def __init__(self): + super().__init__() + + self._callbacks.append(self._permute_qk) + + self.pixtral_mapping = { + "wq": "q_proj", + "wk": "k_proj", + "wv": "v_proj", + "wo": "o_proj", + "w1": "gate_proj", + "w2": "down_proj", + "w3": "up_proj", + "w_in": "linear_1", + "w_out": "linear_2", + } + + self.mistral_llm_mapping = { + "layers": "model.layers", + "attention": "self_attn", + "qscale_act": "input_scale", + "qscale_weight": "weight_scale_inv", + 
"kv_fake_quantizer.qscale_act": "kv_scale", + "q_fake_quantizer.qscale_act": "attn.q_scale", + "k_fake_quantizer.qscale_act": "k_scale", + "v_fake_quantizer.qscale_act": "v_scale", + "attention_norm": "input_layernorm", + "feed_forward": "mlp", + "ffn_norm": "post_attention_layernorm", + "tok_embeddings": "model.embed_tokens", + "output": "lm_head", + "norm": "model.norm", + # For Eagle3 + "language_model.eagle_linear": "model.fc", + "language_model.layers": "layers", + "language_model.norm": "norm", + } + self.mistral_llm_mapping.update(self.pixtral_mapping) + + # Adapted from: + # https://github.com/vllm-project/vllm/blob/883b42896a9ed9791750d721fad26005b7569eba/vllm/model_executor/models/llama.py#L657 + def rename_by_params_map(self, params_map: dict[str, str], weights: dict) -> dict: + renamed_weights = {} + + for key in list(weights.keys()): + new_key = key + modules = key.split(".") + num_modules = len(modules) + for i in range(num_modules): + item = modules[i] + next_item = modules[i + 1] if i < num_modules - 1 else None + + combined_item = f"{item}.{next_item}" if next_item is not None else None + + if combined_item in params_map: + new_key = new_key.replace(combined_item, params_map[combined_item]) + elif item in params_map: + new_key = new_key.replace(item, params_map[item]) + + renamed_weights[new_key] = weights[key] + + return renamed_weights + + def _permute_qk(self, module: nn.Module, new_name: str, weights: dict): + # Adapted from: + # https://github.com/vllm-project/vllm/blob/883b42896a9ed9791750d721fad26005b7569eba/vllm/model_executor/models/llama.py#L657 + + processed_weights = {} + config = self.config.pretrained_config + + def permute(w, n_heads: int, attn_out: int): + attn_in = config.head_dim * n_heads + + return ( + w.view(n_heads, attn_in // n_heads // 2, 2, attn_out) + .transpose(1, 2) + .reshape(attn_in, attn_out) + ) + + # rotary embeds should be sliced + # If using quantized model in mistral format, + # quantization scales 
(qscale_weight) also need to be sliced + + if new_name in ["k_proj", "q_proj"]: + n_heads = ( + config.num_key_value_heads if new_name == "k_proj" else config.num_attention_heads + ) + + processed_weights["weight"] = permute(weights["weight"], n_heads, config.hidden_size) + + if "qscale_weight" in weights and weights["qscale_weight"].numel() > 1: + processed_weights["qscale_weight"] = permute(weights["qscale_weight"], n_heads, 1) + + return processed_weights + + return weights + + +@register_mapper("mistral_large_3") +@register_mapper("mistral_large_3", "PixtralForConditionalGeneration") +@register_mapper("mistral_large_3", "MistralLarge3ForCausalLM") +class MistralLarge3WeightMapper(MistralWeightMapper): + def __init__(self): + super().__init__() + + self.mistral_llm_mapping.update( + { + "wkv_a_with_mqa": "kv_a_proj_with_mqa", + "wkv_b": "kv_b_proj", + "wq_a": "q_a_proj", + "q_a_norm": "q_a_layernorm", + "wq_b": "q_b_proj", + "kv_a_norm": "kv_a_layernorm", + "k_fake_quantizer.qscale_act": "mla_attn.mla_attn.k_scale", + "q_fake_quantizer.qscale_act": "mla_attn.mla_attn.q_scale", + "v_fake_quantizer.qscale_act": "mla_attn.mla_attn.v_scale", + "gate": "mlp.gate", + "shared_experts": "mlp.shared_experts", + "experts": "mlp.experts", + "router_biases": "mlp.gate.e_score_correction_bias", + } + ) diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 40fbaa983d..8df4eae706 100755 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -746,17 +746,19 @@ class Deepseekv3MoE(nn.Module): config = model_config.pretrained_config self.top_k = top_k self.use_dp = model_config.mapping.enable_attention_dp - self.gate = DeepseekV3Gate( - hidden_size, - num_experts, - top_k=top_k, - n_group=config.n_group, - topk_group=config.topk_group, - routed_scaling_factor=config.routed_scaling_factor, - dtype=dtype, - fuse_routing_kernel=True, - apply_routing=False, - 
moe_backend=model_config.moe_backend) + gate_cls = DeepseekV3Gate + if hasattr(model_config.pretrained_config, "gate_cls"): + gate_cls = model_config.pretrained_config.gate_cls + self.gate = gate_cls(hidden_size, + num_experts, + top_k=top_k, + n_group=config.n_group, + topk_group=config.topk_group, + routed_scaling_factor=config.routed_scaling_factor, + dtype=dtype, + fuse_routing_kernel=True, + apply_routing=False, + moe_backend=model_config.moe_backend) self.experts = create_moe( num_experts=num_experts, routing_method=self.gate.routing_method, diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py index 9ade4dee22..2667d20d55 100644 --- a/tensorrt_llm/_torch/models/modeling_mistral.py +++ b/tensorrt_llm/_torch/models/modeling_mistral.py @@ -1,6 +1,7 @@ +import copy import dataclasses import os -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Tuple import torch import torchvision @@ -14,11 +15,16 @@ from tensorrt_llm._torch.attention_backend.interface import ( PositionalEmbeddingParams, RopeParams) from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.models import modeling_pixtral +from tensorrt_llm._torch.models.checkpoints.mistral.weight_mapper import \ + MistralWeightMapper +from tensorrt_llm._torch.models.modeling_mistral_large3 import ( + Mistral3Gate, MistralLarge3ForCausalLM) from tensorrt_llm._torch.models.modeling_multimodal_utils import ( find_input_mm_embeds, fuse_input_embeds, get_multimodal_embeddings) from tensorrt_llm._torch.models.modeling_utils import (DecoderModel, DecoderModelForCausalLM, _load_weights_impl, + filter_weights, register_auto_model) from tensorrt_llm._torch.modules.attention import Attention from tensorrt_llm._torch.modules.decoder_layer import DecoderLayer @@ -52,7 +58,7 @@ class MistralAttention(Attention): def __init__( self, model_config: ModelConfig[MistralConfig], - layer_idx: Optional[int] = None, + 
layer_idx: int | None = None, ): config = model_config.pretrained_config super().__init__( @@ -111,8 +117,8 @@ class MistralDecoderLayer(DecoderLayer): position_ids: torch.IntTensor, hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor] = None, - spec_metadata: Optional[SpecMetadata] = None, + residual: torch.Tensor | None = None, + spec_metadata: SpecMetadata | None = None, **kwargs, ) -> torch.Tensor: if residual is None: @@ -169,11 +175,11 @@ class MistralModel(DecoderModel): def forward( self, attn_metadata: AttentionMetadata, - input_ids: Optional[torch.IntTensor] = None, - position_ids: Optional[torch.IntTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - spec_metadata: Optional[SpecMetadata] = None, - lora_params: Optional[Any] = None, + input_ids: torch.IntTensor | None = None, + position_ids: torch.IntTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + spec_metadata: SpecMetadata | None = None, + lora_params: Any | None = None, ) -> torch.Tensor: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( @@ -222,7 +228,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, self, model_path: str, config: PretrainedConfig, - tokenizer: Optional[AutoTokenizer], + tokenizer: AutoTokenizer | None, trust_remote_code: bool = False, **kwargs, ): @@ -264,9 +270,11 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, @torch.inference_mode() def __call__( self, inputs: TextPrompt, sampling_params: SamplingParams - ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]: + ) -> Tuple[List[int], ExtraProcessedInputs | None]: images = inputs.get("multi_modal_data", {}).get("image") - do_rescale = self.processor.image_processor.do_rescale + mm_processor_kwargs = inputs.get("mm_processor_kwargs", {}) + do_rescale = getattr(self.processor.image_processor, "do_rescale", + False) if images is not None and isinstance(images[0], torch.Tensor): # The default 
multimodal input loader will normalize images to [0, 1] when the requested # format is "pt" (pytorch tensors), but not for "pil" (PIL images). @@ -276,6 +284,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, text=inputs["prompt"], images=images, do_rescale=do_rescale, + **mm_processor_kwargs, ) input_ids = processed.pop("input_ids").tolist()[0] # Remaining in `processed`: @@ -331,6 +340,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, @register_auto_model("Mistral3ForConditionalGeneration") +@register_auto_model("PixtralForConditionalGeneration") @register_input_processor( Mistral3InputProcessor, model_type="mistral3", @@ -365,34 +375,48 @@ class Mistral3VLM(PreTrainedModel): config = model_config.pretrained_config super().__init__(config) - self.model_config = model_config - - llm_model_config = self._get_sub_model_config(model_config, - "text_config") - # This is necessary for the auto weight mapper to figure out what it needs. - llm_model_config.pretrained_config.architectures = config.architectures - self.llm = MistralForCausalLM(llm_model_config) - - self._device = "cuda" - # NOTE: current `modelopt` does not support quantizing the vision portion. - vision_model_config = self._get_sub_model_config(model_config, - "vision_config", - quant_config=None) - self._vision_tower = modeling_pixtral.PixtralVisionModel( - vision_model_config) - self._multi_modal_projector = Mistral3MultiModalProjector(model_config) - vision_feature_layer = config.vision_feature_layer + vision_feature_layer = getattr(config, "vision_feature_layer", -1) if vision_feature_layer != -1: raise ValueError( f"Using intermediate layers ({vision_feature_layer}) in the `PixtralVisionModel` " f"is not supported. 
Please use `vision_feature_layer=-1`.") + self._device = "cuda" self.model_dtype = getattr(config, "torch_dtype", torch.bfloat16) - - self._image_token_ids = torch.tensor([config.image_token_index], + image_token_index = getattr( + config, "image_token_index", None) or getattr( + config.vision_config, "image_token_id", None) + self._image_token_ids = torch.tensor([image_token_index], dtype=torch.int32, device=self._device) + + model_config_cp = copy.deepcopy(model_config) + + llm_model_config = self._get_sub_model_config(model_config_cp, + "text_config") + self.model_config = model_config_cp + llm_class = MistralForCausalLM + if llm_model_config.pretrained_config.architectures[ + 0] == "MistralLarge3ForCausalLM": + llm_class = MistralLarge3ForCausalLM + + llm_model_config.pretrained_config.gate_cls = Mistral3Gate + self.llm = llm_class(llm_model_config) + self.model_config.extra_attrs.update(llm_model_config.extra_attrs) + + # NOTE: current `modelopt` does not support quantizing the vision portion. + # NOTE: attn_backend: Pixtral head size not always divisible by 128 + vision_model_config = self._get_sub_model_config(model_config_cp, + "vision_config", + attn_backend="VANILLA", + quant_config=None) + + self._vision_tower = modeling_pixtral.PixtralVisionModel( + vision_model_config) + self._multi_modal_projector = Mistral3MultiModalProjector( + model_config).eval().to(self._device) self._post_config() + self.is_loaded = True # This is necessary because the executor looks at # `model.model_config.pretrained_config.vocab_size`. 
@@ -400,18 +424,39 @@ class Mistral3VLM(PreTrainedModel): self.config = self.llm.config self.model_config.pretrained_config = self.llm.config - def load_weights(self, weights: Dict, *args, **kwargs): - llm_weights = _filter_weights(weights, "language_model.") - self.llm.load_weights(llm_weights, *args, **kwargs) + def load_weights(self, weights: Dict, weight_mapper=None, *args, **kwargs): + vit_params_map = None + if weight_mapper: + if isinstance(weight_mapper, MistralWeightMapper): + vit_params_map = weight_mapper.pixtral_mapping - vit_weights = _filter_weights(weights, "vision_tower.") - self._vision_tower.load_weights(vit_weights, *args, **kwargs) + llm_weights = filter_weights(weights=weights, prefix="language_model") + logger.debug(f"Loading weights for {type(self.llm)}") + self.llm.load_weights(llm_weights) + logger.debug(f"Successfully loaded weights for {type(self.llm)}") - mm_projector_weights = _filter_weights(weights, - "multi_modal_projector.") - # `_load_weights_impl` assumes `config.hidden_size` exists, which is not the case for the - # top-level `Mistral3Config`. 
+ vit_weights = filter_weights(weights=weights, prefix="vision_tower") + logger.debug(f"Loading weights for {type(self._vision_tower)}") + + if vit_params_map is not None: + vit_weights = weight_mapper.rename_by_params_map( + weights=vit_weights, params_map=vit_params_map) + + self._vision_tower.load_weights(vit_weights) + logger.debug( + f"Successfully loaded weights for {type(self._vision_tower)}") + + logger.debug(f"Loading weights for {type(self._multi_modal_projector)}") + mm_projector_weights = filter_weights(weights=weights, + prefix="multi_modal_projector") + + if vit_params_map is not None: + mm_projector_weights = weight_mapper.rename_by_params_map( + weights=mm_projector_weights, params_map=vit_params_map) self._multi_modal_projector.load_state_dict(mm_projector_weights) + logger.debug( + f"Successfully loaded weights for {type(self._multi_modal_projector)}" + ) def infer_max_seq_len(self) -> int: return self.llm.infer_max_seq_len() @@ -420,9 +465,10 @@ class Mistral3VLM(PreTrainedModel): def forward( self, attn_metadata: AttentionMetadata, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, + input_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, return_context_logits: bool = False, + spec_metadata: SpecMetadata | None = None, **kwargs, ) -> torch.Tensor: """Forward method.""" @@ -455,6 +501,7 @@ class Mistral3VLM(PreTrainedModel): position_ids=position_ids, inputs_embeds=inputs_embeds, return_context_logits=return_context_logits, + spec_metadata=spec_metadata, ) @staticmethod @@ -465,16 +512,41 @@ class Mistral3VLM(PreTrainedModel): ) -> ModelConfig: # Extract the subconfig from the `transformers` config and shove it into our own # `ModelConfig` class. + assert name in [ + "text_config", "vision_config" + ], f"Expected subconfig name to be either 'text_config' or 'vision_config'. Got {name} instead." 
+ pretrained_config = getattr(model_config.pretrained_config, name) + sub_model_config: ModelConfig[MistralConfig] = dataclasses.replace( model_config, pretrained_config=getattr(model_config.pretrained_config, name), **changes, ) + if name == "text_config": + sub_model_config._frozen = False + sub_model_config.skip_create_weights_in_init = True + if not hasattr( + sub_model_config.pretrained_config, "architectures" + ) or sub_model_config.pretrained_config.architectures is None: + sub_model_config.pretrained_config.architectures = model_config.pretrained_config.architectures + sub_model_config._frozen = True + # Make sure some fields that are not explicitly included in the sub config, but present # in the top-level config, are replicated. if (hasattr(sub_model_config.pretrained_config, "torch_dtype") and sub_model_config.pretrained_config.torch_dtype is None): - sub_model_config.pretrained_config.torch_dtype = model_config.pretrained_config.torch_dtype + sub_model_config.pretrained_config.torch_dtype = model_config.pretrained_config.torch_dtype or torch.bfloat16 + + if name == "vision_config": + pretrained_config = sub_model_config.pretrained_config + defaults = { + "head_dim": pretrained_config.hidden_size // + pretrained_config.num_attention_heads, + "hidden_act": "silu", + } + for attr, default in defaults.items(): + if not hasattr(pretrained_config, attr): + setattr(pretrained_config, attr, default) return sub_model_config @@ -572,6 +644,12 @@ class Mistral3VLM(PreTrainedModel): def mm_token_ids(self): return self._image_token_ids + def load_draft_weights( + self, + weights: Dict, + weight_mapper: MistralWeightMapper | None = None) -> None: + self.llm.load_draft_weights(weights, weight_mapper=weight_mapper) + # Original implementation: # https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/mistral3/modeling_mistral3.py#L66 @@ -586,13 +664,15 @@ class Mistral3PatchMerger(torch.nn.Module): self.config = config hidden_size = 
config.vision_config.hidden_size - self._spatial_merge_size = config.spatial_merge_size + self._spatial_merge_size = getattr( + config, "spatial_merge_size", None) or getattr( + config.vision_config, "spatial_merge_size") self._patch_size = config.vision_config.patch_size self.merging_layer = Linear( in_features=hidden_size * self._spatial_merge_size**2, out_features=hidden_size, bias=False, - dtype=config.torch_dtype, + dtype=config.torch_dtype or model_config.torch_dtype, mapping=model_config.mapping, ) @@ -640,7 +720,7 @@ class Mistral3MultiModalProjector(torch.nn.Module): self.model_config = model_config self.config = config - dtype = config.torch_dtype + dtype = config.torch_dtype or model_config.torch_dtype self.norm = RMSNorm( hidden_size=config.vision_config.hidden_size, # NOTE: the original implementation actually does not look at the config for this value. @@ -650,21 +730,21 @@ class Mistral3MultiModalProjector(torch.nn.Module): ) self.patch_merger = Mistral3PatchMerger(model_config) # We have hidden_size * the number of vision feature layers - num_feature_layers = 1 if isinstance(config.vision_feature_layer, - int) else len( - config.vision_feature_layer) + vision_feature_layer = getattr(config, "vision_feature_layer", -1) + num_feature_layers = 1 if isinstance(vision_feature_layer, + int) else len(vision_feature_layer) self.linear_1 = Linear( in_features=config.vision_config.hidden_size * num_feature_layers, out_features=config.text_config.hidden_size, - bias=config.multimodal_projector_bias, + bias=getattr(config, "multimodal_projector_bias", None), dtype=dtype, mapping=model_config.mapping, ) - self.act = ACT2FN[config.projector_hidden_act] + self.act = ACT2FN[getattr(config, "projector_hidden_act", "gelu")] self.linear_2 = Linear( in_features=config.text_config.hidden_size, out_features=config.text_config.hidden_size, - bias=config.multimodal_projector_bias, + bias=getattr(config, "multimodal_projector_bias", None), dtype=dtype, 
mapping=model_config.mapping, ) diff --git a/tensorrt_llm/_torch/models/modeling_mistral_large3.py b/tensorrt_llm/_torch/models/modeling_mistral_large3.py new file mode 100644 index 0000000000..c88cebdf05 --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_mistral_large3.py @@ -0,0 +1,70 @@ +from typing import Dict, List + +import torch +from torch import nn + +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.models.checkpoints.mistral.weight_mapper import MistralLarge3WeightMapper +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3ForCausalLM +from tensorrt_llm._torch.models.modeling_utils import register_auto_model +from tensorrt_llm._torch.modules.fused_moe import RenormalizeNaiveMoeRoutingMethod +from tensorrt_llm.quantization.mode import QuantAlgo + + +class Mistral3Gate(nn.Module): + def __init__( + self, + hidden_size: int, + num_experts: int, + top_k: int, + dtype: torch.dtype | None = None, + **kwargs, + ): + super().__init__() + self.weight = nn.Parameter( + torch.empty((num_experts, hidden_size), dtype=dtype), requires_grad=False + ) + self.top_k = top_k + self.dtype = dtype + self.routing_method = RenormalizeNaiveMoeRoutingMethod(top_k=self.top_k) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits: torch.Tensor = torch.ops.trtllm.cublas_mm( + hidden_states, self.weight.t(), bias=None, out_dtype=self.dtype + ) + return logits + + def load_weights(self, weights: List[Dict]): + assert len(weights) == 1 + + self.weight.copy_(weights[0]["weight"][:]) + + +@register_auto_model("MistralLarge3ForCausalLM") +class MistralLarge3ForCausalLM(DeepseekV3ForCausalLM): + def __init__(self, model_config: ModelConfig): + super().__init__(model_config) + self.weight_mapper = MistralLarge3WeightMapper() + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + def load_weights(self, weights: Dict): + assert self.model_config is not None, "self.model_config is 
required" + params_map = self.weight_mapper.mistral_llm_mapping.copy() + quantization_weights_map: Dict[str, str] = {} + if self.model_config.quant_config.quant_algo == QuantAlgo.NVFP4: + quantization_weights_map = { + "weight_packed": "weight", + "input_global_scale": "input_scale", + "weight_global_scale": "weight_scale_2", + } + elif self.model_config.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES: + quantization_weights_map = { + "weight_scale": "weight_scale_inv", + } + if quantization_weights_map: + params_map.update(quantization_weights_map) + weights = self.weight_mapper.rename_by_params_map(weights=weights, params_map=params_map) + + super().load_weights(weights) diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index 6013d51fa2..e4fa9da6e6 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -38,14 +38,22 @@ _CONFIG_REGISTRY: dict[str, type[transformers.PretrainedConfig]] = LazyConfigDic def load_pretrained_config(model_name_or_path: str, trust_remote_code: bool = False, + checkpoint_format: str = None, **kwargs) -> transformers.PretrainedConfig: config_dict, _ = transformers.PretrainedConfig.get_config_dict( model_name_or_path, **kwargs) model_type = config_dict.get("model_type") + if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] model_config = config_class.from_pretrained(model_name_or_path, **kwargs) + elif checkpoint_format in ("mistral", "mistral_large_3"): + from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import \ + MistralConfigLoader + model_config = getattr( + MistralConfigLoader().load(model_name_or_path).pretrained_config, + "text_config") else: model_config = transformers.AutoConfig.from_pretrained( model_name_or_path, trust_remote_code=trust_remote_code) diff --git a/tensorrt_llm/inputs/registry.py b/tensorrt_llm/inputs/registry.py index 7737600e6f..54902a5ba3 100644 
--- a/tensorrt_llm/inputs/registry.py +++ b/tensorrt_llm/inputs/registry.py @@ -600,6 +600,12 @@ def create_input_processor( logger.debug( f"Unable to load HF config from {model_path_or_dir}: {e}. Falling back." ) + elif checkpoint_format in ("mistral", "mistral_large_3"): + logger.debug(f"Detected checkpoint_format={checkpoint_format}.") + from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import \ + MistralConfigLoader + model_config = MistralConfigLoader().load(model_path_or_dir) + config = model_config.pretrained_config else: logger.debug( f"checkpoint_format={checkpoint_format}; skipping HF config load.") diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 70285d0aea..c9699bb91f 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -112,7 +112,8 @@ class OpenAIServer: from tensorrt_llm._torch.pyexecutor.config_utils import \ load_pretrained_config self.model_config = load_pretrained_config(hf_tokenizer_path, - trust_remote_code=trust_remote_code) + trust_remote_code=trust_remote_code, + checkpoint_format=getattr(self.llm.args, "checkpoint_format", None)) except Exception: logger.debug("Failed to load AutoConfig for %s", hf_tokenizer_path) self.model_config = None diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index c62ff5a0d8..33f7dddc6b 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -281,3 +281,5 @@ bigcode/starcoder2-7b: - accuracy: 26.5 bigcode/starcoder2-15b: - accuracy: 54.5 +mistral/Mistral-Large-3-675B: + - accuracy: 90.83 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index dd404ba8f7..f728919abe 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -340,3 
+340,5 @@ mistralai/Mistral-Nemo-12b-Base: - accuracy: 69.66 - quant_algo: FP8 accuracy: 69.66 +mistral/Mistral-Large-3-675B: + - accuracy: 87.54 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f4bb84ae63..3b667b15c9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4828,3 +4828,107 @@ class TestLlama3_1_8B_Instruct_RocketKV(LlmapiAccuracyTestHarness): task.evaluate(llm, sampling_params=sampling_params, extra_evaluator_kwargs=extra_evaluator_kwargs) + + +class TestMistralLarge3_675B(LlmapiAccuracyTestHarness): + MODEL_NAME = "mistral/Mistral-Large-3-675B" + + @skip_pre_blackwell + @pytest.mark.skip_less_mpi_world_size(4) + @pytest.mark.skip_less_device_memory(183000) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3", + [ + (4, 1, 4, False, True, True, "TRTLLM", False), + ], + ids=[ + "latency_moe_trtllm", + ], + ) + def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, + cuda_graph, overlap_scheduler, moe_backend, eagle3): + + if moe_backend == "TRTLLM" and (get_sm_version() == 120 + or get_sm_version() == 121): + pytest.skip( + "MOE TRTLLM backend does not support SM version 120 or 121") + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + enable_block_reuse=not eagle3) + spec_config = None + if eagle3: + spec_config = EagleDecodingConfig( + max_draft_len=2, + speculative_model_dir= + f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", + eagle3_one_model=True) + with LLM( + f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-NVFP4/", + 
checkpoint_format="mistral", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_blackwell + @pytest.mark.skip_less_mpi_world_size(8) + @pytest.mark.skip_less_device_memory(183000) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3", + [ + (8, 1, 8, False, True, True, "DEEPGEMM", False), + ], + ids=[ + "latency_moe_deepgemm", + ], + ) + def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, + overlap_scheduler, moe_backend, eagle3): + + if moe_backend == "DEEPGEMM" and (get_sm_version() == 120 + or get_sm_version() == 121): + pytest.skip( + "MOE DEEPGEMM backend does not support SM version 120 or 121") + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + enable_block_reuse=not eagle3) + spec_config = None + if eagle3: + spec_config = EagleDecodingConfig( + max_draft_len=2, + speculative_model_dir= + f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512-Eagle/", + eagle3_one_model=True) + with LLM( + f"{llm_models_root()}/Mistral-Large-3-675B/Mistral-Large-3-675B-Instruct-2512/", + checkpoint_format="mistral", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) diff --git 
a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index ccd23bdf08..f54045dd16 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -109,6 +109,7 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90) - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index 447e989f54..b53a64c61b 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -91,3 +91,4 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) + - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90) From 7fc720a3973d3ef226ac3edcb4e506804caeba66 Mon Sep 17 00:00:00 2001 From: shuyixiong <219646547+shuyixiong@users.noreply.github.com> Date: Sat, 13 Dec 2025 14:10:01 +0800 Subject: [PATCH 113/172] [TRTLLM-9784][fix] Resolve port conflicts (#9780) Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com> --- tensorrt_llm/executor/ray_executor.py | 18 +++++- tensorrt_llm/executor/ray_gpu_worker.py | 73 ++++++++++++++++++++----- 2 files 
changed, 75 insertions(+), 16 deletions(-) diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index e03f524bea..0fc4fa2810 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -13,7 +13,7 @@ from ray.util.placement_group import (PlacementGroupSchedulingStrategy, placement_group) from tensorrt_llm._ray_utils import unwrap_ray_errors -from tensorrt_llm._utils import get_free_port, nvtx_range_debug +from tensorrt_llm._utils import nvtx_range_debug from tensorrt_llm.logger import logger from ..llmapi.utils import logger_debug @@ -76,7 +76,6 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): self.world_size = model_world_size self.tp_size = tp_size self.master_address = ray.util.get_node_ip_address() - self.master_port = get_free_port() self.worker_kwargs = dict( **worker_kwargs, @@ -126,7 +125,6 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): runtime_env["env_vars"].update({ "TLLM_DISABLE_MPI": "1", "MASTER_ADDR": self.master_address, # head-IP for NCCL/Gloo - "MASTER_PORT": str(self.master_port) }) placement_groups, self.bundle_indices = self._get_placement_group( @@ -156,6 +154,13 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): ray.get(self._get_worker_ready_futures()) except ray.exceptions.ActorDiedError as e: raise RuntimeError("RayGPUWorker died during initialization") from e + port = self.call_all_ray_workers("setup_tcp_store", + leader_only=True, + async_call=False)[0] + self.call_all_ray_workers("setup_distributed_env_and_worker", + leader_only=False, + async_call=False, + port=port) async def init_workers_async(self): self.create_workers(RayGPUWorker, self.worker_kwargs) @@ -163,6 +168,13 @@ class RayExecutor(RpcExecutorMixin, GenerationExecutor): await asyncio.gather(*self._get_worker_ready_futures()) except ray.exceptions.ActorDiedError as e: raise RuntimeError("RayGPUWorker died during initialization") from e + port = (await 
asyncio.gather(*self.call_all_ray_workers( + "setup_tcp_store", leader_only=True, async_call=True)))[0] + await asyncio.gather( + *self.call_all_ray_workers("setup_distributed_env_and_worker", + leader_only=False, + async_call=True, + port=port)) @unwrap_ray_errors() def call_all_ray_workers(self, func: str, leader_only: bool, diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index fca5386cb5..864d23d3af 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -1,6 +1,7 @@ import gc import importlib import os +from functools import wraps from pathlib import Path from queue import Queue from typing import Any, List, Optional, Type, Union @@ -43,7 +44,8 @@ class RayWorkerWrapper: def __init__(self, worker_cls, worker_kwargs, world_size, rank): self.master_address = os.environ["MASTER_ADDR"] - self.master_port = os.environ["MASTER_PORT"] + self.world_size = world_size + self.rank = rank # Ray can't pickle TensorRT logger global logger from tensorrt_llm.logger import logger @@ -55,39 +57,83 @@ class RayWorkerWrapper: # Physical gpu id self.gpu = int(ray.get_gpu_ids()[0]) - local_gpu = self.physical_to_local_id(self.gpu) + self.local_gpu = self.physical_to_local_id(self.gpu) - torch.distributed.init_process_group( - backend="cuda:nccl,cpu:gloo", - init_method=f"tcp://{self.master_address}:{self.master_port}", - world_size=world_size, - rank=rank) + torch.cuda.set_device(self.local_gpu) + self.worker_cls = RayWorkerWrapper._inject_worker_extension( + worker_cls, worker_kwargs.pop("ray_worker_extension_cls", None)) + self.worker_kwargs = worker_kwargs + + def _create_tcp_store(self, + port: Optional[int] = None + ) -> torch.distributed.TCPStore: + # port=0 means let the OS pick an available port (only valid for master) + # For non-master, port must be specified to connect to master's port + actual_port = port if port is not None else 0 + return 
torch.distributed.TCPStore(host_name=self.master_address, + port=actual_port, + world_size=self.world_size, + is_master=(self.rank == 0), + wait_for_workers=False) + + def setup_tcp_store(self): + if self.rank != 0: + raise RuntimeError("Only the master worker can setup TCP store") + self.store = self._create_tcp_store() + return self.store.port + + def setup_distributed_env_and_worker(self, port: int): + if self.rank != 0: + self.store = self._create_tcp_store(port) + + torch.distributed.init_process_group(backend="cuda:nccl,cpu:gloo", + store=self.store, + world_size=self.world_size, + rank=self.rank) logger.info( - f"[Rank {rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {local_gpu}" + f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}" ) - torch.cuda.set_device(local_gpu) + self.worker = self.worker_cls(device_id=self.local_gpu, + **self.worker_kwargs) + self._has_setup_distributed_env_and_worker = True - worker_cls = RayWorkerWrapper._inject_worker_extension( - worker_cls, worker_kwargs.pop("ray_worker_extension_cls", None)) - self.worker = worker_cls(device_id=local_gpu, **worker_kwargs) + @property + def has_setup_distributed_env_and_worker(self) -> bool: + return getattr(self, '_has_setup_distributed_env_and_worker', False) + def ensure_distributed_setup(func): + + @wraps(func) + def wrapper(self, *args, **kwargs): + if not self.has_setup_distributed_env_and_worker: + raise RuntimeError( + "Have not setup distributed environment and worker yet") + return func(self, *args, **kwargs) + + return wrapper + + @ensure_distributed_setup def submit(self, request: GenerationRequest) -> GenerationResult: return self.worker.submit(request) + @ensure_distributed_setup def enqueue_request(self, request: GenerationRequest, result_wait_queue: Queue | None = None) -> int: return self.worker.enqueue_request(request, result_wait_queue) + @ensure_distributed_setup def abort_request(self, request_id: int) -> None: 
self.worker.abort_request(request_id) + @ensure_distributed_setup def report_device_id(self) -> str: local_id = self.physical_to_local_id(self.gpu) return get_device_uuid(local_id) + @ensure_distributed_setup def call_worker_method(self, method_name: str, *args, **kwargs): """Generic method to call any method on the underlying worker.""" if hasattr(self.worker, method_name): @@ -103,7 +149,8 @@ class RayWorkerWrapper: f"The RayGPUWorker has no method called '{method_name}'.") def shutdown(self): - return self.worker.shutdown() + if hasattr(self, 'worker'): + self.worker.shutdown() def __repr__(self) -> str: """Customizes the actor's prefix in the Ray logs. From 6a6e41f802d92614f24afc37903cdc8b2aee0556 Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Fri, 12 Dec 2025 22:29:41 -0800 Subject: [PATCH 114/172] [TRTLLM-9468][chore] Update disagg benchmarking scripts to support context parallelism (#9720) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- .../disaggregated/slurm/benchmark/README.md | 5 +++-- .../disaggregated/slurm/benchmark/config.yaml | 10 +++++++++- .../slurm/benchmark/disaggr_torch.slurm | 19 ++++++++++++++++++ .../slurm/benchmark/gen_server_config.py | 20 ++++++++++++++----- .../disaggregated/slurm/benchmark/submit.py | 9 +++++++-- .../_torch/attention_backend/trtllm.py | 13 ++++-------- .../_torch/pyexecutor/model_engine.py | 5 +++++ tensorrt_llm/llmapi/llm_args.py | 18 +++++++++++++++++ tensorrt_llm/serve/openai_disagg_service.py | 13 ++++++++++-- .../accuracy/test_disaggregated_serving.py | 5 ++++- 10 files changed, 95 insertions(+), 22 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/README.md b/examples/disaggregated/slurm/benchmark/README.md index 1d11aba581..29b7301f08 100644 --- a/examples/disaggregated/slurm/benchmark/README.md +++ b/examples/disaggregated/slurm/benchmark/README.md @@ -175,15 +175,16 @@ Results are automatically organized in the 
work directory: ### Benchmark Modes -The system supports two primary benchmark modes: +The system supports three primary benchmark modes: 1. **End-to-End (e2e)**: Tests the complete disaggregated inference pipeline including both context processing and token generation phases 2. **Generation Only (gen_only)**: Focuses solely on testing the generation phase with pre-cached KV data +3. **Generation Only No Context (gen_only_no_context)**: Skips launching context workers entirely by setting `TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1`. This is useful when you only want to benchmark the generation phase without allocating resources for context workers. Configure the mode in the YAML file: ```yaml benchmark: - mode: "e2e" # or "gen_only" + mode: "e2e" # or "gen_only" or "gen_only_no_context" ``` ### Metrics Collection diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index dde6576d97..afe7282348 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -11,7 +11,7 @@ slurm: # Benchmark Mode benchmark: - mode: "e2e" # Options: e2e, gen_only + mode: "e2e" # Options: e2e, gen_only, gen_only_no_context use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script multi_round: 8 # Number of benchmark rounds benchmark_ratio: 0.8 # Benchmark ratio @@ -34,6 +34,7 @@ environment: model_path: "" trtllm_repo: "" build_wheel: false # Don't build the wheel when launching multiple jobs + cuda_architectures: "" # Optional CUDA architectures to build for (e.g. "90-real;100-real"). If empty, builds for all architectures trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. 
If provided, install from this wheel instead work_dir: "" worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" @@ -59,6 +60,11 @@ worker_config: enable_attention_dp: true enable_lm_head_tp_in_adp: true pipeline_parallel_size: 1 + context_parallel_size: 1 + # Uncomment this section to enable context parallelism. + # cp_config: + # cp_type: "HELIX" + # tokens_per_block: 32 # must match kv_config.tokens_per_block. max_batch_size: 256 max_num_tokens: 512 max_seq_len: 2251 @@ -83,6 +89,7 @@ worker_config: trust_remote_code: true kv_cache_config: enable_block_reuse: false + tokens_per_block: 32 free_gpu_memory_fraction: 0.8 dtype: fp8 moe_config: @@ -103,6 +110,7 @@ worker_config: max_num_tokens: 4608 max_seq_len: 1227 tensor_parallel_size: 4 + context_parallel_size: 1 moe_expert_parallel_size: 4 enable_attention_dp: true pipeline_parallel_size: 1 diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index 0e2c7e64d8..e811290230 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -26,6 +26,7 @@ while [[ $# -gt 0 ]]; do --container-mount) container_mount="$2"; shift 2 ;; --container-image) container_image="$2"; shift 2 ;; --build-wheel) build_wheel="$2"; shift 2 ;; + --cuda-architectures) cuda_architectures="$2"; shift 2 ;; --trtllm-wheel-path) trtllm_wheel_path="$2"; shift 2 ;; # Accuracy evaluation @@ -79,6 +80,14 @@ echo echo "Server Environment Variables:" echo " server_env_var: ${server_env_var}" +# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode +if [ "${benchmark_mode}" = "gen_only_no_context" ]; then + export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 + worker_env_var="${worker_env_var} TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1" + server_env_var="${server_env_var} TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1" + echo "Setting 
TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode" +fi + container_name="disaggr-test" # Function to cleanup on failure @@ -122,6 +131,9 @@ elif [ -d "${trtllm_repo}" ]; then if [ "${build_wheel}" = "true" ]; then echo "Building TensorRT-LLM wheel on one node..." build_command="python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt --benchmarks --use_ccache --clean" + if [ -n "${cuda_architectures:-}" ]; then + build_command="${build_command} --cuda_architectures \"${cuda_architectures}\"" + fi if ! srun --container-name=${container_name} \ --container-mounts=${container_mount} \ --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \ @@ -160,8 +172,15 @@ for i in "${!node_array[@]}"; do echo "Replaced $placeholder with $current_val" done +# start the workers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set). echo "Starting worker commands from ${start_worker_cmds_file}..." cat ${start_worker_cmds_file} | while read cmd; do + # Skip ctx worker commands if in gen-only mode + # CTX appears as argument to start_worker.sh and in log filename + if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then + echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}" + continue + fi echo "Starting worker command: ${cmd}" eval "${cmd}" done diff --git a/examples/disaggregated/slurm/benchmark/gen_server_config.py b/examples/disaggregated/slurm/benchmark/gen_server_config.py index 478d7bf22b..fcab212b42 100644 --- a/examples/disaggregated/slurm/benchmark/gen_server_config.py +++ b/examples/disaggregated/slurm/benchmark/gen_server_config.py @@ -35,12 +35,17 @@ if __name__ == "__main__": time.sleep(10) print(f"Waiting for hostnames folder {hostnames_folder} to be found") hostnames = os.listdir(hostnames_folder) - # check length of hostnames is equal to num_ctx_servers + num_gen_servers, if not, sleep 10 seconds and check again - while len(hostnames) != args.num_ctx_servers + 
args.num_gen_servers: + + # Skip context servers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set + gen_only = os.getenv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") == "1" + expected_hostnames = args.num_gen_servers if gen_only else args.num_ctx_servers + args.num_gen_servers + + # check length of hostnames is equal to expected count, if not, sleep 10 seconds and check again + while len(hostnames) != expected_hostnames: time.sleep(10) hostnames = os.listdir(hostnames_folder) print( - f"Waiting for hostnames to be found in {hostnames_folder}, current length: {len(hostnames)}, expected length: {args.num_ctx_servers + args.num_gen_servers}" + f"Waiting for hostnames to be found in {hostnames_folder}, current length: {len(hostnames)}, expected length: {expected_hostnames}" ) print(f"All hostnames found in {hostnames_folder}") @@ -65,13 +70,18 @@ if __name__ == "__main__": hostname = socket.gethostname() print(f"Current hostname: {hostname}") + # Skip context servers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set + gen_only = os.getenv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") == "1" + server_config = { 'hostname': hostname, 'port': args.server_port, 'backend': 'pytorch', 'context_servers': { - 'num_instances': args.num_ctx_servers, - 'urls': ctx_urls + 'num_instances': + 0 if gen_only else args.num_ctx_servers, + 'urls': [] if gen_only else + [f'{host}:{args.worker_port}' for host in ctx_hostnames] }, 'generation_servers': { 'num_instances': args.num_gen_servers, diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index d605c9fd59..5263dadc2f 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -133,6 +133,7 @@ def submit_job(config, log_dir): # Set default environment configuration for backward compatibility env_config.setdefault('trtllm_repo', '') env_config.setdefault('build_wheel', False) + env_config.setdefault('cuda_architectures', '') 
env_config.setdefault('trtllm_wheel_path', '') env_config.setdefault('worker_env_var', '') env_config.setdefault('server_env_var', '') @@ -154,12 +155,14 @@ def submit_job(config, log_dir): # Calculate nodes based on world sizes ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size'] + ctx_cp_size = config['worker_config']['ctx']['context_parallel_size'] ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size'] - ctx_world_size = ctx_tp_size * ctx_pp_size + ctx_world_size = ctx_tp_size * ctx_cp_size * ctx_pp_size ctx_nodes = calculate_nodes(ctx_world_size, ctx_num, gpus_per_node) gen_tp_size = config['worker_config']['gen']['tensor_parallel_size'] + gen_cp_size = config['worker_config']['gen']['context_parallel_size'] gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size'] - gen_world_size = gen_tp_size * gen_pp_size + gen_world_size = gen_tp_size * gen_cp_size * gen_pp_size gen_nodes = calculate_nodes(gen_world_size, gen_num, gpus_per_node) total_nodes = ctx_nodes + gen_nodes total_tasks = total_nodes * gpus_per_node @@ -284,6 +287,7 @@ def submit_job(config, log_dir): *([] if not slurm_config['set_segment'] else [f'--segment={total_nodes}']), f'--output={log_dir}/slurm-%j.out', f'--error={log_dir}/slurm-%j.err', + f'--gpus-per-node={hw_config["gpus_per_node"]}', *([arg for arg in slurm_config['extra_args'].split() if arg]), slurm_config['script_file'], @@ -309,6 +313,7 @@ def submit_job(config, log_dir): '--container-mount', env_config['container_mount'], '--container-image', env_config['container_image'], '--build-wheel', str(env_config['build_wheel']).lower(), + '--cuda-architectures', env_config['cuda_architectures'], '--trtllm-wheel-path', env_config['trtllm_wheel_path'], # Accuracy evaluation diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index 0623be8e65..fc4c7136f9 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ 
b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -1878,16 +1878,11 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]): assert metadata.kv_cache_manager is not None sink_token_length = 0 - # Ensure helix_is_inactive_rank is on the same device as other tensors. + # Ensure helix_is_inactive_rank and position_ids are on the same device. if helix_is_inactive_rank is not None: - if isinstance(helix_is_inactive_rank, list): - helix_is_inactive_rank = torch.tensor( - helix_is_inactive_rank, - dtype=torch.bool, - device=helix_position_offsets.device) - elif helix_is_inactive_rank.device.type != 'cuda': - helix_is_inactive_rank = helix_is_inactive_rank.to( - helix_position_offsets.device) + assert helix_is_inactive_rank.device == helix_position_offsets.device, \ + f"helix_is_inactive_rank must be on the same device as helix_position_offsets, " \ + f"got {helix_is_inactive_rank.device} vs {helix_position_offsets.device}" mla_tensor_params = [helix_position_offsets, helix_is_inactive_rank] diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 6d49804e20..be6ae4bf3c 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -2014,6 +2014,11 @@ class PyTorchModelEngine(ModelEngine): attn_metadata.request_ids = request_ids attn_metadata.prompt_lens = prompt_lengths + if helix_is_inactive_rank is not None and len( + helix_is_inactive_rank) > 0: + helix_is_inactive_rank = torch.tensor(helix_is_inactive_rank, + dtype=torch.bool, + device='cuda') attn_metadata.helix_is_inactive_rank = helix_is_inactive_rank attn_metadata.num_contexts = len(scheduled_requests.context_requests) # Use num_chunked_ctx_requests to record the number of extend context requests, diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 30bfa3675a..cd7858c6f4 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ 
-3066,6 +3066,24 @@ class TorchLlmArgs(BaseLlmArgs): return self + @model_validator(mode='after') + def validate_helix_tokens_per_block(self) -> 'TorchLlmArgs': + """Validate that cp_config.tokens_per_block matches kv_cache_config.tokens_per_block when HELIX parallelism is active.""" + if self.context_parallel_size == 1 or self.cp_config is None or not self.cp_config: + return self + + cp_type = self.cp_config.get('cp_type', None) + if cp_type is not None and str(cp_type).upper() == 'HELIX': + cp_tokens_per_block = self.cp_config.get('tokens_per_block', None) + if cp_tokens_per_block is not None: + kv_tokens_per_block = self.kv_cache_config.tokens_per_block + assert cp_tokens_per_block == kv_tokens_per_block, ( + f"When HELIX parallelism is active, cp_config.tokens_per_block ({cp_tokens_per_block}) " + f"must match kv_cache_config.tokens_per_block ({kv_tokens_per_block})." + ) + + return self + def warn_on_unstable_feature_usage(self) -> 'TorchLlmArgs': """Warn on unstable feature usage.""" set_fields = self.model_dump(exclude_unset=True).keys() diff --git a/tensorrt_llm/serve/openai_disagg_service.py b/tensorrt_llm/serve/openai_disagg_service.py index d1f8d8dad7..a0012bd6d3 100644 --- a/tensorrt_llm/serve/openai_disagg_service.py +++ b/tensorrt_llm/serve/openai_disagg_service.py @@ -250,14 +250,23 @@ class OpenAIDisaggregatedService(OpenAIService): await self._gen_router.stop_server_monitoring() async def _wait_for_all_servers_ready(self) -> None: + # Skip context servers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set + gen_only = os.getenv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") == "1" + async def check_servers_ready(): elapsed_time = 0 interval = self._health_check_interval_secs while elapsed_time < self._server_start_timeout_secs: - _, unready_ctx_servers = await self._ctx_client.check_ready() + if gen_only: + unready_ctx_servers = [] + else: + _, unready_ctx_servers = await self._ctx_client.check_ready() _, unready_gen_servers = await self._gen_client.check_ready() if 
len(unready_ctx_servers) == 0 and len(unready_gen_servers) == 0: - logger.info("All servers are ready") + if gen_only: + logger.info("Generation servers are ready (context servers skipped)") + else: + logger.info("All servers are ready") return logger.info( f"Waiting for servers, context: {unready_ctx_servers}, generation: {unready_gen_servers}" diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 894114c0f4..31f04f9968 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -863,7 +863,10 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "enable_chunked_prefill": False, - "cuda_graph_config": None, + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128] + }, "cache_transceiver_config": { "backend": "UCX" }, From 8cbf2d958c0bc57fe6e10047c6b919979c83db67 Mon Sep 17 00:00:00 2001 From: shuyixiong <219646547+shuyixiong@users.noreply.github.com> Date: Sat, 13 Dec 2025 17:02:11 +0800 Subject: [PATCH 115/172] [TRTLLM-9738][chore] Guard accuracy with nccl allreduce strategy (#9793) Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com> --- .../test_accuracy_with_allreduce_strategy.py | 408 ++++++++++++++++++ .../ray_orchestrator/multi_gpu/test_ops.py | 9 +- 2 files changed, 411 insertions(+), 6 deletions(-) create mode 100644 tests/unittest/_torch/ray_orchestrator/multi_gpu/test_accuracy_with_allreduce_strategy.py diff --git a/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_accuracy_with_allreduce_strategy.py b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_accuracy_with_allreduce_strategy.py new file mode 100644 index 0000000000..765bd7f5f4 --- /dev/null +++ 
b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_accuracy_with_allreduce_strategy.py @@ -0,0 +1,408 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import os +from functools import partial +from typing import List, Tuple + +import pytest +import ray +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils.llm_data import llm_models_root + +from tensorrt_llm import LLM +from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams + + +class HFModel: + def __init__(self, model_name: str, device_id: int): + self.device_id = device_id + self.model = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=torch.bfloat16 + ).to(f"cuda:{device_id}") + + def generate_batch_with_padding( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + responses: List[List[int]], + prompt_max_len: int = 1024, + micro_batch_size: int = 16, + ): + """ + Synchronous inference on a batch with micro-batching. + Directly extracts response logprobs to save memory. 
+ + Args: + input_ids: [batch_size, seq_len] + attention_mask: [batch_size, seq_len] + position_ids: [batch_size, seq_len] + responses: List of response token IDs for each sample + prompt_max_len: Maximum prompt length (default 1024) + micro_batch_size: Size of each micro batch to avoid OOM + + Returns: + List of logprobs tensors, one per sample [response_len] + """ + # Move tensors to the correct device + input_ids = input_ids.to(f"cuda:{self.device_id}") + attention_mask = attention_mask.to(f"cuda:{self.device_id}") + position_ids = position_ids.to(f"cuda:{self.device_id}") + + batch_size = input_ids.shape[0] + num_micro_batches = (batch_size + micro_batch_size - 1) // micro_batch_size + + all_response_logprobs = [] + + with torch.no_grad(): + for micro_idx in range(num_micro_batches): + start_idx = micro_idx * micro_batch_size + end_idx = min((micro_idx + 1) * micro_batch_size, batch_size) + + # Extract micro batch + micro_input_ids = input_ids[start_idx:end_idx] + micro_attention_mask = attention_mask[start_idx:end_idx] + micro_position_ids = position_ids[start_idx:end_idx] + + # Forward pass + outputs = self.model( + input_ids=micro_input_ids, + attention_mask=micro_attention_mask, + position_ids=micro_position_ids, + ) + + # Extract response logprobs for each sample in this micro batch + micro_logits = outputs.logits # [micro_batch_size, seq_len, vocab_size] + + for i in range(micro_logits.shape[0]): + sample_idx = start_idx + i + response = responses[sample_idx] + response_len = len(response) + + # Extract logits for predicting response tokens + # For predicting response[j], we need logits at position prompt_max_len-1+j + response_logits = micro_logits[ + i, prompt_max_len - 1 : prompt_max_len - 1 + response_len, : + ] + + # Convert to logprobs + response_logprobs = torch.log_softmax(response_logits, dim=-1) + + # Extract logprobs for the actual generated tokens + response_tensor = torch.tensor( + response, dtype=torch.long, device=response_logprobs.device + 
) + ref_logprob_for_tokens = torch.gather( + response_logprobs, dim=-1, index=response_tensor.unsqueeze(-1) + ).squeeze(-1) + + all_response_logprobs.append(ref_logprob_for_tokens) + + # Free memory immediately after processing each micro batch + del outputs, micro_logits + torch.cuda.empty_cache() + + return all_response_logprobs + + +async def generate_batch_async( + hf_model: HFModel, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + responses: List[List[int]], + prompt_max_len: int = 1024, + micro_batch_size: int = 16, +) -> List[torch.Tensor]: + """ + Async wrapper for generate_batch_with_padding. + Runs the synchronous model inference in a thread pool. + + Args: + hf_model: HFModel instance + input_ids: Input token IDs + attention_mask: Attention mask + position_ids: Position IDs + responses: List of response token IDs for each sample + prompt_max_len: Maximum prompt length + micro_batch_size: Size of micro batches for processing + + Returns: + List of logprobs tensors, one per sample + """ + loop = asyncio.get_event_loop() + + func = partial( + hf_model.generate_batch_with_padding, + prompt_max_len=prompt_max_len, + micro_batch_size=micro_batch_size, + ) + + result = await loop.run_in_executor( + None, # Use default executor + func, + input_ids, + attention_mask, + position_ids, + responses, + ) + return result + + +def pad_data( + original_prompts: List[List[int]], + generated_token_ids_list: List[List[int]], + prompt_max_len: int = 1024, + response_max_len: int = 1024, + pad_token_id: int = 0, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Pad the data to the maximum length. 
+ + Structure: + [left_pad | actual_prompt | actual_response | right_pad] + |<-- prompt_max_len=1024 -->|<-- response_max_len=1024 -->| + + Args: + original_prompts: List of prompt token IDs, len = batch_size + generated_token_ids_list: List of response token IDs, len = batch_size + prompt_max_len: Maximum length for prompt section (default 1024) + response_max_len: Maximum length for response section (default 1024) + pad_token_id: Token ID for padding (default 0) + Returns: + input_ids: Tensor of shape [batch_size, prompt_max_len + response_max_len] + attention_mask: Tensor of shape [batch_size, prompt_max_len + response_max_len] + position_ids: Tensor of shape [batch_size, prompt_max_len + response_max_len] + """ + batch_size = len(original_prompts) + total_len = prompt_max_len + response_max_len + + for i, (prompt, response) in enumerate(zip(original_prompts, generated_token_ids_list)): + assert len(prompt) <= prompt_max_len, ( + f"Batch {i}: Prompt length {len(prompt)} exceeds max {prompt_max_len}" + ) + assert len(response) <= response_max_len, ( + f"Batch {i}: Response length {len(response)} exceeds max {response_max_len}" + ) + + # Build batch tensors [batch_size, 2048] + batch_input_ids = torch.full( + (batch_size, total_len), pad_token_id, dtype=torch.long, device="cuda" + ) + batch_attention_mask = torch.zeros((batch_size, total_len), dtype=torch.long, device="cuda") + batch_position_ids = torch.zeros((batch_size, total_len), dtype=torch.long, device="cuda") + + response_lens = [] + + for i in range(batch_size): + prompt_tokens = original_prompts[i] + response_tokens = generated_token_ids_list[i] + + prompt_len = len(prompt_tokens) + response_len = len(response_tokens) + response_lens.append(response_len) + + left_pad_len = prompt_max_len - prompt_len + + # Fill input_ids: [left_pad | prompt | response | right_pad] + prompt_start = left_pad_len + prompt_end = prompt_max_len + response_start = prompt_max_len + response_end = prompt_max_len + response_len + 
+ batch_input_ids[i, prompt_start:prompt_end] = torch.tensor( + prompt_tokens, dtype=torch.long, device="cuda" + ) + batch_input_ids[i, response_start:response_end] = torch.tensor( + response_tokens, dtype=torch.long, device="cuda" + ) + + # Fill attention_mask: 1 for actual tokens, 0 for padding + batch_attention_mask[i, prompt_start:response_end] = 1 + + # Fill position_ids: sequential for actual tokens + actual_seq_len = prompt_len + response_len + batch_position_ids[i, prompt_start:response_end] = torch.arange( + actual_seq_len, dtype=torch.long, device="cuda" + ) + # Right padding keeps the last position value + if response_len < response_max_len: + batch_position_ids[i, response_end:] = actual_seq_len - 1 + + return batch_input_ids, batch_attention_mask, batch_position_ids + + +def compare_logprobs(logprobs_list, ref_new_token_logprobs_list): + """ + logprobs_list: List[torch.Tensor] - LLM logprob values + ref_new_token_logprobs_list: List[torch.Tensor] - Ref logprobs + + Compares logprobs for each prompt separately. 
+ """ + assert len(logprobs_list) == len(ref_new_token_logprobs_list) + + final_max_diff = float("-inf") + final_min_diff = float("inf") + final_mean_diff = 0.0 + for llm_logprobs_i, ref_logprobs_i in zip(logprobs_list, ref_new_token_logprobs_list): + logprobs_diff = ref_logprobs_i - llm_logprobs_i + max_diff = logprobs_diff.max().item() + min_diff = logprobs_diff.min().item() + mean_diff = logprobs_diff.mean().item() + + final_max_diff = max(final_max_diff, max_diff) + final_min_diff = min(final_min_diff, min_diff) + final_mean_diff += mean_diff + + final_mean_diff = final_mean_diff / len(logprobs_list) + # Given e^(-2.30) ≈ 0.1, the probability ratio should not drop below 0.1x + assert abs(final_min_diff) < 2.30, ( + f"Final Min diff: {final_min_diff:.6f} is below threshold -2.30" + ) + + +@pytest.mark.gpu4 +@pytest.mark.parametrize("model_dir", ["Qwen2-7B-Instruct"]) +@pytest.mark.parametrize("sampler_type", ["TRTLLMSampler"]) +@pytest.mark.parametrize("allreduce_strategy", ["NCCL", "AUTO"]) +def test_accuracy_with_allreduce_strategy(model_dir, sampler_type, allreduce_strategy): + """Test accuracy with different allreduce strategies. + + The default allreduce_strategy (AUTO) produced wrong logprobs with large batch size, + causing VeRL integration to fail to converge. There may be an issue with the + customAllReduce kernels. 
+ + Tracked: NVBug (https://nvbugs/5727691) + + Expected behavior: + - allreduce_strategy="NCCL": Accuracy assertion PASSES + - allreduce_strategy="AUTO": Accuracy assertion FAILS + """ + model_dir = str(llm_models_root() / model_dir) + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + tokenizer = AutoTokenizer.from_pretrained(model_dir) + prompt_text = "The president of the United States is" + prompt = tokenizer.encode(prompt_text, add_special_tokens=False) + del tokenizer + + test_prompts = [prompt] * 256 + + llm_logprobs = [] + llm_responses = [] + try: + kv_cache_config = KvCacheConfig(enable_block_reuse=False, free_gpu_memory_fraction=0.6) + llm = LLM( + model=model_dir, + backend="pytorch", + orchestrator_type="ray", + ray_worker_extension_cls="tensorrt_llm.llmapi.rlhf_utils.WorkerExtension", + kv_cache_config=kv_cache_config, + max_seq_len=2048, + max_batch_size=256, + max_num_tokens=8192, + tensor_parallel_size=4, + sampler_type=sampler_type, + allreduce_strategy=allreduce_strategy, + ) + + sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=1024, logprobs=1) + outputs = llm.generate(test_prompts, sampling_params) + + for output in outputs: + token_ids = output.outputs[0].token_ids + logprobs_list = output.outputs[0].logprobs # list[dict[int, Logprob]] + # Extract logprob values from the list of dicts + logprob_values = [ + logprobs[token_id].logprob for token_id, logprobs in zip(token_ids, logprobs_list) + ] + llm_responses.append(token_ids) + llm_logprobs.append(torch.tensor(logprob_values, dtype=torch.float32, device="cuda")) + finally: + if ray.is_initialized(): + ray.shutdown() + + torch.cuda.empty_cache() + input_ids, attention_mask, position_ids = pad_data(test_prompts, llm_responses) + + # Split data across GPUs + num_gpus = 4 + micro_batch_size = 16 + batch_size = input_ids.shape[0] + samples_per_gpu = (batch_size + num_gpus - 1) // num_gpus + + dp_hf_models = [] + for device_id in range(num_gpus): + hf_model = HFModel(model_dir, 
device_id) + dp_hf_models.append(hf_model) + + # Split input data and responses into chunks for each GPU + input_ids_chunks = [] + attention_mask_chunks = [] + position_ids_chunks = [] + responses_chunks = [] + + for i in range(num_gpus): + start_idx = i * samples_per_gpu + end_idx = min((i + 1) * samples_per_gpu, batch_size) + + if start_idx < batch_size: + input_ids_chunks.append(input_ids[start_idx:end_idx]) + attention_mask_chunks.append(attention_mask[start_idx:end_idx]) + position_ids_chunks.append(position_ids[start_idx:end_idx]) + responses_chunks.append(llm_responses[start_idx:end_idx]) + + # Process each chunk on its corresponding GPU asynchronously + async def process_all_chunks(hf_models: List[HFModel]): + tasks = [] + for i, (input_chunk, attn_chunk, pos_chunk, resp_chunk) in enumerate( + zip(input_ids_chunks, attention_mask_chunks, position_ids_chunks, responses_chunks) + ): + task = generate_batch_async( + hf_models[i], + input_chunk, + attn_chunk, + pos_chunk, + resp_chunk, + prompt_max_len=1024, + micro_batch_size=micro_batch_size, + ) + tasks.append(task) + return await asyncio.gather(*tasks) + + ref_logprobs_chunks = asyncio.run(process_all_chunks(dp_hf_models)) + + # Move all tensors to cuda:0 and flatten the list + # Each GPU returns a list of logprobs tensors + ref_new_token_logprobs = [] + for i, logprobs_list in enumerate(ref_logprobs_chunks): + for logprobs in logprobs_list: + ref_new_token_logprobs.append(logprobs.to("cuda:0")) + + assert len(ref_new_token_logprobs) == batch_size, ( + f"Count mismatch: got {len(ref_new_token_logprobs)}, expected {batch_size}" + ) + + del dp_hf_models + torch.cuda.empty_cache() + + # Compare LLM logprobs vs HF reference + if allreduce_strategy == "AUTO": + with pytest.raises(AssertionError, match=r"Final Min diff: .* is below threshold -2\.30"): + compare_logprobs(llm_logprobs, ref_new_token_logprobs) + else: + compare_logprobs(llm_logprobs, ref_new_token_logprobs) diff --git 
a/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py index d2738e769a..7a6bf607d8 100644 --- a/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py +++ b/tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py @@ -185,8 +185,7 @@ class AllreducePGTest: return True -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires at least 2 GPUs for this test") +@pytest.mark.gpu2 @pytest.mark.parametrize("hidden_size", [128, 1024], ids=lambda x: f"hidden:{x}") @pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}") @@ -253,8 +252,7 @@ def test_allgather_pg_op(seq_len, hidden_size, var_len): assert r is True -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires at least 2 GPUs for this test") +@pytest.mark.gpu2 @pytest.mark.parametrize("hidden_size", [128, 1024], ids=lambda x: f"hidden:{x}") @pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}") @@ -329,8 +327,7 @@ def test_reducescatter_pg_op(seq_len, hidden_size, var_len): assert r is True -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires at least 2 GPUs for this test") +@pytest.mark.gpu2 @pytest.mark.parametrize("hidden_size", [128, 1024], ids=lambda x: f"hidden:{x}") @pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}") From 85406f9dda0ef8eb942132db8e79e6db30e06ba1 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Sat, 13 Dec 2025 17:14:43 +0800 Subject: [PATCH 116/172] [https://nvbugs/5720482][fix] Fix test rpc streaming (#9902) Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> --- tensorrt_llm/executor/ipc.py | 72 ++++++++++---- tensorrt_llm/executor/rpc/rpc_common.py | 1 + tensorrt_llm/executor/rpc/rpc_server.py | 110 ++++++++++++---------- tests/integration/test_lists/waives.txt | 1 - tests/unittest/executor/test_rpc.py | 119 +++++++++++++++++++++++- 5 files 
changed, 232 insertions(+), 71 deletions(-) diff --git a/tensorrt_llm/executor/ipc.py b/tensorrt_llm/executor/ipc.py index 03cb322871..f09dd31dc4 100644 --- a/tensorrt_llm/executor/ipc.py +++ b/tensorrt_llm/executor/ipc.py @@ -59,6 +59,7 @@ class ZeroMqQueue: self._setup_done = False self.name = name self.socket = self.context.socket(socket_type) + self.socket.set_hwm(0) # For ROUTER sockets, track the last identity to enable replies. For now we assume there is only one client in our case. self._last_identity = None @@ -154,14 +155,14 @@ class ZeroMqQueue: else: return False - def put(self, obj: Any): + def put(self, obj: Any, routing_id: Optional[bytes] = None): self.setup_lazily() self._check_thread_safety() with nvtx_range_debug("send", color="blue", category="IPC"): if self.use_hmac_encryption or self.socket_type == zmq.ROUTER: # Need manual serialization for encryption or ROUTER multipart data = self._prepare_data(obj) - self._send_data(data) + self._send_data(data, routing_id=routing_id) else: # Standard socket without encryption - use pyobj directly self.socket.send_pyobj(obj) @@ -197,14 +198,14 @@ class ZeroMqQueue: else: logger.error(f"Failed to send object: {obj}") - async def put_async(self, obj: Any): + async def put_async(self, obj: Any, routing_id: Optional[bytes] = None): self.setup_lazily() self._check_thread_safety() try: if self.use_hmac_encryption or self.socket_type == zmq.ROUTER: # Need manual serialization for encryption or ROUTER multipart data = self._prepare_data(obj) - await self._send_data_async(data) + await self._send_data_async(data, routing_id=routing_id) else: # Standard socket without encryption await self.socket.send_pyobj(obj) @@ -243,7 +244,9 @@ class ZeroMqQueue: self._check_thread_safety() return await self._recv_data_async() - async def get_async_noblock(self, timeout: float = 0.5) -> Any: + async def get_async_noblock(self, + timeout: float = 0.5, + return_identity: bool = False) -> Any: """Get data with timeout using polling 
to avoid message drops. This method uses ZMQ's NOBLOCK flag with polling instead of asyncio.wait_for @@ -251,9 +254,10 @@ class ZeroMqQueue: Args: timeout: Timeout in seconds + return_identity: Whether to return the identity of the sender (for ROUTER sockets) Returns: - The received object + The received object, or (object, identity) if return_identity is True Raises: asyncio.TimeoutError: If timeout is reached without receiving data @@ -271,13 +275,22 @@ class ZeroMqQueue: identity, data = await self.socket.recv_multipart( flags=zmq.NOBLOCK) self._last_identity = identity - return self._parse_data(data) + obj = self._parse_data(data) + if return_identity: + return obj, identity + else: + return obj else: if self.use_hmac_encryption: data = await self.socket.recv(flags=zmq.NOBLOCK) - return self._parse_data(data) + obj = self._parse_data(data) else: - return await self.socket.recv_pyobj(flags=zmq.NOBLOCK) + obj = await self.socket.recv_pyobj(flags=zmq.NOBLOCK) + + if return_identity: + return obj, None + else: + return obj except zmq.Again: # No message available yet if asyncio.get_event_loop().time() >= deadline: @@ -329,30 +342,39 @@ class ZeroMqQueue: else: return pickle.loads(data) # nosec B301 - def _send_data(self, data: bytes, flags: int = 0): + def _send_data(self, + data: bytes, + flags: int = 0, + routing_id: Optional[bytes] = None): """Send data using appropriate API based on socket type.""" if self.socket_type == zmq.ROUTER: - if self._last_identity is None: + identity = routing_id if routing_id is not None else self._last_identity + if identity is None: raise ValueError("ROUTER socket requires identity") - self.socket.send_multipart([self._last_identity, data], flags=flags) + self.socket.send_multipart([identity, data], flags=flags) else: self.socket.send(data, flags=flags) - async def _send_data_async(self, data: bytes): + async def _send_data_async(self, + data: bytes, + routing_id: Optional[bytes] = None): """Async version of _send_data.""" if 
self.socket_type == zmq.ROUTER: - if self._last_identity is None: + identity = routing_id if routing_id is not None else self._last_identity + if identity is None: raise ValueError("ROUTER socket requires identity") - await self.socket.send_multipart([self._last_identity, data]) + await self.socket.send_multipart([identity, data]) else: await self.socket.send(data) - def _recv_data(self) -> Any: + def _recv_data(self, return_identity: bool = False) -> Any: """Receive data using appropriate API based on socket type.""" if self.socket_type == zmq.ROUTER: identity, data = self.socket.recv_multipart() self._last_identity = identity # Store for replies obj = self._parse_data(data) + if return_identity: + return obj, identity return obj else: if self.use_hmac_encryption: @@ -360,20 +382,30 @@ class ZeroMqQueue: obj = self._parse_data(data) else: obj = self.socket.recv_pyobj() + + if return_identity: + return obj, None return obj - async def _recv_data_async(self) -> Any: + async def _recv_data_async(self, return_identity: bool = False) -> Any: """Async version of _recv_data.""" if self.socket_type == zmq.ROUTER: identity, data = await self.socket.recv_multipart() self._last_identity = identity # Store for replies - return self._parse_data(data) + obj = self._parse_data(data) + if return_identity: + return obj, identity + return obj else: if self.use_hmac_encryption: data = await self.socket.recv() - return self._parse_data(data) + obj = self._parse_data(data) else: - return await self.socket.recv_pyobj() + obj = await self.socket.recv_pyobj() + + if return_identity: + return obj, None + return obj def notify_with_retry(self, message, max_retries=5, timeout=1): """ diff --git a/tensorrt_llm/executor/rpc/rpc_common.py b/tensorrt_llm/executor/rpc/rpc_common.py index 6c588c300c..a057a07149 100644 --- a/tensorrt_llm/executor/rpc/rpc_common.py +++ b/tensorrt_llm/executor/rpc/rpc_common.py @@ -75,6 +75,7 @@ class RPCRequest: is_streaming: bool = False creation_timestamp: 
Optional[ float] = None # Unix timestamp when request was created + routing_id: Optional[bytes] = None def __post_init__(self): """Initialize creation_timestamp if not provided.""" diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py index 00fb23e94d..6b598b98ea 100644 --- a/tensorrt_llm/executor/rpc/rpc_server.py +++ b/tensorrt_llm/executor/rpc/rpc_server.py @@ -228,8 +228,10 @@ class RPCServer: while asyncio.get_event_loop().time() < end_time: try: - req: RPCRequest = await asyncio.wait_for( - self._client_socket.get_async_noblock(), timeout=2) + req, routing_id = await asyncio.wait_for( + self._client_socket.get_async_noblock(return_identity=True), + timeout=2) + req.routing_id = routing_id drained_count += 1 logger_debug(f"[server] Draining request after shutdown: {req}") @@ -299,13 +301,16 @@ class RPCServer: error=error, is_streaming= True, # Important: mark as streaming so it gets routed correctly - stream_status='error')) + stream_status='error'), + routing_id=req.routing_id) logger_debug( f"[server] Sent error response for request {req.request_id}", color="green") else: - await self._client_socket.put_async( - RPCResponse(req.request_id, result=None, error=error)) + await self._client_socket.put_async(RPCResponse(req.request_id, + result=None, + error=error), + routing_id=req.routing_id) logger_debug( f"[server] Sent error response for request {req.request_id}", color="green") @@ -335,8 +340,10 @@ class RPCServer: try: #logger_debug(f"[server] Worker waiting for request", color="green") # Read request directly from socket with timeout - req: RPCRequest = await asyncio.wait_for( - self._client_socket.get_async_noblock(), timeout=2) + req, routing_id = await asyncio.wait_for( + self._client_socket.get_async_noblock(return_identity=True), + timeout=2) + req.routing_id = routing_id logger_debug(f"[server] Worker got request: {req}", color="green") except asyncio.TimeoutError: @@ -492,15 +499,15 @@ class RPCServer: func 
= self._functions[req.method_name] if not inspect.isasyncgenfunction(func): - await self._client_socket.put_async( - RPCResponse( - req.request_id, - result=None, - error=RPCStreamingError( - f"Method '{req.method_name}' is not an async generator.", - traceback=traceback.format_exc()), - is_streaming=True, - stream_status='error')) + await self._client_socket.put_async(RPCResponse( + req.request_id, + result=None, + error=RPCStreamingError( + f"Method '{req.method_name}' is not an async generator.", + traceback=traceback.format_exc()), + is_streaming=True, + stream_status='error'), + routing_id=req.routing_id) return chunk_index = 0 @@ -512,13 +519,14 @@ class RPCServer: logger_debug( f"[server] RPC Server running streaming task {req.method_name}") # Send start signal - await self._client_socket.put_async( - RPCResponse(req.request_id, - result=None, - error=None, - is_streaming=True, - chunk_index=chunk_index, - stream_status='start')) + await self._client_socket.put_async(RPCResponse( + req.request_id, + result=None, + error=None, + is_streaming=True, + chunk_index=chunk_index, + stream_status='start'), + routing_id=req.routing_id) logger_debug( f"[server] Sent start signal for request {req.request_id}", color="green") @@ -584,39 +592,41 @@ class RPCServer: chunk_index += 1 # Send end signal - await self._client_socket.put_async( - RPCResponse(req.request_id, - result=None, - error=None, - is_streaming=True, - chunk_index=chunk_index, - stream_status='end')) + await self._client_socket.put_async(RPCResponse( + req.request_id, + result=None, + error=None, + is_streaming=True, + chunk_index=chunk_index, + stream_status='end'), + routing_id=req.routing_id) logger_debug( f"[server] Sent end signal for request {req.request_id}", color="green") except RPCCancelled as e: # Server is shutting down, send cancelled error - await self._client_socket.put_async( - RPCResponse(req.request_id, - result=None, - error=e, - is_streaming=True, - chunk_index=chunk_index, - 
stream_status='error')) + await self._client_socket.put_async(RPCResponse( + req.request_id, + result=None, + error=e, + is_streaming=True, + chunk_index=chunk_index, + stream_status='error'), + routing_id=req.routing_id) logger_debug( f"[server] Sent error signal for request {req.request_id}", color="green") except asyncio.TimeoutError: - await self._client_socket.put_async( - RPCResponse( - req.request_id, - result=None, - error=RPCTimeout( - f"Streaming method '{req.method_name}' timed out", - traceback=traceback.format_exc()), - is_streaming=True, - chunk_index=chunk_index, - stream_status='error')) + await self._client_socket.put_async(RPCResponse( + req.request_id, + result=None, + error=RPCTimeout( + f"Streaming method '{req.method_name}' timed out", + traceback=traceback.format_exc()), + is_streaming=True, + chunk_index=chunk_index, + stream_status='error'), + routing_id=req.routing_id) except Exception as e: response = RPCResponse( @@ -633,7 +643,8 @@ class RPCServer: response: RPCResponse) -> bool: """Safely sends a response, handling pickle errors.""" try: - await self._client_socket.put_async(response) + await self._client_socket.put_async(response, + routing_id=req.routing_id) logger_debug(f"[server] Sent response for request {req.request_id}", color="green") return True @@ -661,7 +672,8 @@ class RPCServer: traceback=traceback.format_exc())) try: - await self._client_socket.put_async(error_response) + await self._client_socket.put_async(error_response, + routing_id=req.routing_id) logger_debug( f"[server] Sent error response for request {req.request_id}", color="green") diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f6c76b23d3..08c35dcf4f 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -422,6 +422,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) -unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_streaming SKIP (https://nvbugs/5720482) unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py index 7b3f2814dd..d0a0fb23bd 100644 --- a/tests/unittest/executor/test_rpc.py +++ b/tests/unittest/executor/test_rpc.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import threading import time @@ -200,7 +201,9 @@ class TestRpcCorrectness: ) == no + 1, f"result {future.result()} != {no + 1}" def test_incremental_task_streaming(self): - with RpcServerWrapper(TestRpcCorrectness.App()) as server: + with RpcServerWrapper(TestRpcCorrectness.App(), + async_run_task=True) as server: + with RPCClient(server.addr) as client: async def test_streaming_task(): @@ -218,6 +221,30 @@ class TestRpcCorrectness: asyncio.run(test_streaming_task()) + def test_multi_client_to_single_server(self): + """Test that multiple RPC clients can concurrently connect to a single RPC server and execute tasks.""" + + class App: + + def echo(self, msg: str) -> str: + return msg + + with RpcServerWrapper(App()) as server: + # Create multiple clients + num_clients = 10 + clients = [RPCClient(server.addr) for _ in range(num_clients)] + + try: + # Perform requests from all clients + for i, client in 
enumerate(clients): + msg = f"hello from client {i}" + ret = client.echo(msg).remote() + assert ret == msg, f"Client {i} failed: expected '{msg}', got '{ret}'" + finally: + # Clean up clients + for client in clients: + client.close() + class TestRpcError: @@ -1006,3 +1033,93 @@ class TestRpcRobustness: f"Iteration {i}/{num_calls} completed successfully") print(f"All {num_calls} iterations completed successfully") + + @pytest.mark.parametrize("concurrency", [10, 50, 100]) + def test_many_client_to_single_server(self, concurrency): + """ + Pressure test where many clients connect to a single server. + Controls concurrency via parameter and ensures each client performs multiple operations. + """ + + class App: + + def echo(self, msg: str) -> str: + return msg + + total_clients = max(200, concurrency * 2) + requests_per_client = 100 + + with RpcServerWrapper(App(), async_run_task=True) as server: + errors = [] + + def run_client_session(client_id): + try: + with RPCClient(server.addr) as client: + for i in range(requests_per_client): + msg = f"c{client_id}-req{i}" + ret = client.echo(msg).remote() + assert ret == msg + except Exception as e: + errors.append(f"Client {client_id} error: {e}") + raise + + with concurrent.futures.ThreadPoolExecutor( + max_workers=concurrency) as executor: + futures = [ + executor.submit(run_client_session, i) + for i in range(total_clients) + ] + concurrent.futures.wait(futures) + + # Check for exceptions in futures + for f in futures: + if f.exception(): + errors.append(str(f.exception())) + + assert not errors, f"Encountered errors: {errors[:5]}..." + + @pytest.mark.parametrize("concurrency", [10, 50, 100]) + def test_many_client_to_single_server_threaded(self, concurrency): + """ + Pressure test where clients are created and used in different threads. 
+ """ + import concurrent.futures + + class App: + + def echo(self, msg: str) -> str: + return msg + + # Scale total clients to be more than concurrency to force queueing/reuse + total_clients = max(200, concurrency * 2) + requests_per_client = 100 + + with RpcServerWrapper(App(), async_run_task=True) as server: + errors = [] + + def run_client_session(client_id): + try: + # Client creation and usage happens strictly within this thread + with RPCClient(server.addr) as client: + for i in range(requests_per_client): + msg = f"c{client_id}-req{i}" + ret = client.echo(msg).remote() + assert ret == msg + except Exception as e: + errors.append(f"Client {client_id} error: {e}") + raise + + # Use ThreadPoolExecutor to simulate concurrent threads + with concurrent.futures.ThreadPoolExecutor( + max_workers=concurrency) as executor: + futures = [ + executor.submit(run_client_session, i) + for i in range(total_clients) + ] + concurrent.futures.wait(futures) + + for f in futures: + if f.exception(): + errors.append(str(f.exception())) + + assert not errors, f"Encountered errors: {errors[:5]}..." 
From 079ef8ae77ae558f307bc51284b06abfcc506a55 Mon Sep 17 00:00:00 2001 From: jellysnack <158609015+jellysnack@users.noreply.github.com> Date: Sat, 13 Dec 2025 14:57:59 +0300 Subject: [PATCH 117/172] [None][feat] Graceful Error Handling for Guided Decoder (#9078) Signed-off-by: jellysnack Signed-off-by: jellysnack <158609015+jellysnack@users.noreply.github.com> Co-authored-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- .../_torch/pyexecutor/guided_decoder.py | 139 ++++++++++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 53 ++++++- 2 files changed, 128 insertions(+), 64 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py index efd3379ee0..0d40951604 100644 --- a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py +++ b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py @@ -204,7 +204,7 @@ class GuidedDecoder: def bitmask_size(self) -> int: return math.ceil(self.vocab_size_padded / 32) - def _build(self, requests: GuidedRequests) -> None: + def _build(self, requests: GuidedRequests) -> List[Tuple[int, str]]: """Build the bitmask for requests with guided decoding enabled. Specifically, this method: @@ -212,65 +212,76 @@ class GuidedDecoder: - call the grammar matcher to fill the bitmask on CPU; - asynchronously copy the bitmask to GPU. 
""" + failed_requests = [] self.token_mask_host[:requests.num_bitmask_tokens].fill_(0) for req, offset in requests.valid_requests_with_offsets(): slot = req.seq_slot - self.num_advanced_tokens[slot] = 0 - self.num_guided_tokens[slot] = 0 + try: + self.num_advanced_tokens[slot] = 0 + self.num_guided_tokens[slot] = 0 - matcher_init: bool = req.require_matcher_init() - matcher_advance: bool = req.require_matcher_advance() - if not (matcher_init or matcher_advance): - continue - - if matcher_init: - matcher = self.grammar_matcher_factory.create( - req.guided_decoding_params) - self.grammar_matchers[slot] = matcher - - if matcher_advance: - matcher = self.grammar_matchers[slot] - # The last new token must be acceptable unless the matcher is terminated: - # 1. For the main model loop, when overlap scheduler is enabled, the matcher may have accepted the EOS token in the draft tokens at the previous iteration. - # 2. For the draft model loop, the matcher may have accepted the EOS token at the previous drafting iteration. - if matcher.is_terminated() or self.is_draft_terminated[slot]: + matcher_init: bool = req.require_matcher_init() + matcher_advance: bool = req.require_matcher_advance() + if not (matcher_init or matcher_advance): continue - accepted = matcher.accept_token(req.new_token) - if not accepted: - if req.is_draft: - self.is_draft_terminated[slot] = True - logger.debug( - f"Draft request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}." - ) + + if matcher_init: + matcher = self.grammar_matcher_factory.create( + req.guided_decoding_params) + self.grammar_matchers[slot] = matcher + + if matcher_advance: + matcher = self.grammar_matchers[slot] + # The last new token must be acceptable unless the matcher is terminated or None: + # 1. For the main model loop, when overlap scheduler is enabled, the matcher may have accepted the EOS token in the draft tokens at the previous iteration. + # 2. 
For the draft model loop, the matcher may have accepted the EOS token at the previous drafting iteration. + # 3. The matcher can be None if there was an error during its creation. + if matcher is None or matcher.is_terminated( + ) or self.is_draft_terminated[slot]: continue - # TODO: Make this an error response. - raise ValueError( - f"Request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}." - ) - - self.num_advanced_tokens[slot] += 1 - if not matcher.is_terminated(): - matcher.fill_next_token_bitmask(self.bitmask_host, offset) - self.token_mask_host[offset] = 1 - self.num_guided_tokens[slot] += 1 - # Process draft tokens - for i, tid in enumerate(req.draft_tokens, 1): - accepted = matcher.accept_token(tid) + accepted = matcher.accept_token(req.new_token) if not accepted: - break - self.num_advanced_tokens[slot] += 1 - if matcher.is_terminated(): - break - matcher.fill_next_token_bitmask(self.bitmask_host, - offset + i) - self.token_mask_host[offset + i] = 1 - self.num_guided_tokens[slot] += 1 + if req.is_draft: + self.is_draft_terminated[slot] = True + logger.debug( + f"Draft request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}." + ) + continue + raise ValueError( + f"Request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}." 
+ ) - if req.is_draft: - assert len(req.draft_tokens) == 0 - self.num_advanced_draft_tokens[ - slot] += self.num_advanced_tokens[slot] + self.num_advanced_tokens[slot] += 1 + if not matcher.is_terminated(): + matcher.fill_next_token_bitmask(self.bitmask_host, offset) + self.token_mask_host[offset] = 1 + self.num_guided_tokens[slot] += 1 + # Process draft tokens + for i, tid in enumerate(req.draft_tokens, 1): + accepted = matcher.accept_token(tid) + if not accepted: + break + self.num_advanced_tokens[slot] += 1 + if matcher.is_terminated(): + break + matcher.fill_next_token_bitmask(self.bitmask_host, + offset + i) + self.token_mask_host[offset + i] = 1 + self.num_guided_tokens[slot] += 1 + + if req.is_draft: + assert len(req.draft_tokens) == 0 + self.num_advanced_draft_tokens[ + slot] += self.num_advanced_tokens[slot] + except Exception as e: + error_msg = f"Guided decoding error: {str(e)}" + failed_requests.append((req.request_id, error_msg)) + logger.error( + f"Request {req.request_id} at slot {slot} failed during guided decoding: {error_msg}" + ) + + return failed_requests def _copy_bitmask(self, requests: GuidedRequests, @@ -306,8 +317,8 @@ class GuidedDecoder: scheduled_requests, self.max_num_draft_tokens) @nvtx_range("GuideDecoder.build") - def build(self) -> None: - self._build(self.requests) + def build(self) -> List[Tuple[int, str]]: + return self._build(self.requests) @nvtx_range("GuideDecoder.copy_bitmask") def copy_bitmask(self, num_bitmask_tokens: Optional[int] = None) -> None: @@ -325,8 +336,8 @@ class GuidedDecoder: def execute(self, logits: torch.Tensor, - d2t: Optional[torch.Tensor] = None) -> None: - self.build() + d2t: Optional[torch.Tensor] = None) -> List[Tuple[int, str]]: + failed_requests = self.build() with torch.cuda.stream(self.stream): torch.cuda.current_stream().wait_event(self.token_event) @@ -337,6 +348,8 @@ class GuidedDecoder: self.apply_bitmask(logits, d2t=d2t) self.token_event.record() + return failed_requests + def 
_rollback_rejected_tokens(self, requests: GuidedRequests) -> None: """Rollback the grammar matcher for rejected tokens. @@ -460,23 +473,25 @@ class CapturableGuidedDecoder(GuidedDecoder): ) @hostfunc - def build(self) -> None: - self._build(self.requests_hostfunc) + def build(self) -> List[Tuple[int, str]]: + return self._build(self.requests_hostfunc) def execute(self, logits: torch.Tensor, - d2t: Optional[torch.Tensor] = None) -> None: + d2t: Optional[torch.Tensor] = None) -> List[Tuple[int, str]]: with torch.cuda.stream(self.stream): torch.cuda.current_stream().wait_event(self.token_event) self.fetch_batch() self.init_disagg_gen_requests() - self.build() + failed_requests = self.build() self.copy_bitmask() self.bitmask_event.record() torch.cuda.current_stream().wait_event(self.bitmask_event) self.apply_bitmask(logits, d2t=d2t) + return failed_requests + @hostfunc def rollback_rejected_tokens(self) -> None: self._rollback_rejected_tokens(self.requests_hostfunc) @@ -532,13 +547,13 @@ class CapturableGuidedDecoder(GuidedDecoder): def execute_draft_batch(self, logits: torch.Tensor, d2t: Optional[torch.Tensor] = None, - draft_step: int = 0) -> None: + draft_step: int = 0) -> List[Tuple[int, str]]: with torch.cuda.stream(self.stream): torch.cuda.current_stream().wait_event(self.token_event) self.fetch_draft_batch(draft_step=draft_step) if draft_step == 0: self.rollback_rejected_tokens() - self.build() + failed_requests = self.build() if draft_step == self.max_num_draft_tokens - 1: self.rollback_draft_tokens() # Overwrite num_bitmask_tokens since the request might not be updated on CUDA stream yet. 
@@ -550,3 +565,5 @@ class CapturableGuidedDecoder(GuidedDecoder): self.apply_bitmask(logits, d2t=d2t, num_bitmask_tokens=len(self.requests)) + + return failed_requests diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 5459c62559..4f8bc8820e 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -984,14 +984,22 @@ class PyExecutor: batch_outputs = self._forward_step(scheduled_batch) + guided_decoder_failed_requests = None if self.guided_decoder is not None: self.guided_decoder.add_batch(scheduled_batch) - self.guided_decoder.execute( + guided_decoder_failed_requests = self.guided_decoder.execute( batch_outputs['logits']) sample_state = self._sample_async( scheduled_batch, batch_outputs) assert sample_state is not None, "Sampling failed" + + # Handle guided decoder errors after _sample_async to avoid state conflicts. + # If called before, failed requests would be marked as GENERATION_COMPLETE, + # causing _sample_async to fail when accessing context_chunk_size property. + self._handle_guided_decoder_errors( + scheduled_batch, guided_decoder_failed_requests) + self._update_request_states(scheduled_batch) if self.enable_iter_perf_stats: @@ -1306,11 +1314,21 @@ class PyExecutor: self.guided_decoder.rollback_draft_tokens() batch_outputs = self._forward_step(scheduled_batch) + + guided_decoder_failed_requests = None if self.guided_decoder is not None: - self.guided_decoder.execute(batch_outputs['logits']) + guided_decoder_failed_requests = self.guided_decoder.execute( + batch_outputs['logits']) sample_state = self._sample_async(scheduled_batch, batch_outputs) + + # Handle guided decoder errors after _sample_async to avoid state conflicts. + # If called before, failed requests would be marked as GENERATION_COMPLETE, + # causing _sample_async to fail when accessing context_chunk_size property. 
+ self._handle_guided_decoder_errors( + scheduled_batch, guided_decoder_failed_requests) + if self.drafter is not None: self.drafter.run_drafter_post(scheduled_batch, self.resource_manager, @@ -1562,15 +1580,23 @@ class PyExecutor: self.drafter.cleanup_previous_draft_resources() if can_queue: + guided_decoder_failed_requests = None if self.guided_decoder is not None: # add_batch must be called again to have updated new tokens. self.guided_decoder.add_batch(scheduled_batch) - self.guided_decoder.execute(batch_outputs['logits']) + guided_decoder_failed_requests = self.guided_decoder.execute( + batch_outputs['logits']) sample_state = self._sample_async(scheduled_batch, batch_outputs) assert sample_state is not None, "Sampling failed" + # Handle guided decoder errors after _sample_async to avoid state conflicts. + # If called before, failed requests would be marked as GENERATION_COMPLETE, + # causing _sample_async to fail when accessing context_chunk_size property. + self._handle_guided_decoder_errors( + scheduled_batch, guided_decoder_failed_requests) + self._update_request_states(scheduled_batch) ctx_transmission_reqs = self._send_disagg_ctx_cache( @@ -2694,6 +2720,27 @@ class PyExecutor: def reset_prefix_cache(self): self.kv_cache_manager.reset_reuse_state() + def _handle_guided_decoder_errors( + self, scheduled_batch: ScheduledRequests, + failed_requests: Optional[List[Tuple[int, str]]]): + """Handle errors that occurred during guided decoding. 
+ + Args: + scheduled_batch: The current batch of scheduled requests + failed_requests: List of (request_id, error_message) tuples for failed requests, + or None if no failures occurred + """ + if not failed_requests: + return + + failed_req_id_to_err = {req_id: err for req_id, err in failed_requests} + + for request in scheduled_batch.all_requests(): + if request.py_request_id not in failed_req_id_to_err: + continue + error_msg = failed_req_id_to_err[request.py_request_id] + self._handle_errors(error_msg, requests=[request]) + class DisaggPPTerminationHandler: """Handles termination synchronization across pipeline parallel ranks under disaggregated serving. From 383b13e0e5d7d11977bca6ec8c68515572d6033c Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Sat, 13 Dec 2025 10:38:22 -0500 Subject: [PATCH 118/172] [None][feat] Implement sampling on 1-model EAGLE3 (#9885) Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- examples/llm-api/quickstart_advanced.py | 7 +- .../_torch/pyexecutor/model_engine.py | 6 +- .../_torch/pyexecutor/py_executor_creator.py | 11 +++ tensorrt_llm/_torch/speculative/eagle3.py | 43 ++++++++- tensorrt_llm/_torch/speculative/interface.py | 87 ++++++++++++++++++ .../_torch/speculative/one_model_sampler.py | 91 +++++++++++++++++++ tensorrt_llm/_torch/speculative/utils.py | 1 + tensorrt_llm/llmapi/llm_args.py | 4 + .../defs/accuracy/test_llm_api_pytorch.py | 3 +- 9 files changed, 248 insertions(+), 5 deletions(-) create mode 100644 tensorrt_llm/_torch/speculative/one_model_sampler.py diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index 5aa7f7ce70..f028d41e55 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -143,6 +143,9 @@ def add_llm_args(parser): default=False, action='store_true') parser.add_argument('--dynamic_tree_max_topK', type=int, default=None) + 
parser.add_argument('--allow_advanced_sampling', + default=False, + action='store_true') # Relaxed acceptance parser.add_argument('--use_relaxed_acceptance_for_thinking', @@ -210,7 +213,9 @@ def setup_llm(args, **kwargs): eagle3_one_model=args.use_one_model, eagle_choices=args.eagle_choices, use_dynamic_tree=args.use_dynamic_tree, - dynamic_tree_max_topK=args.dynamic_tree_max_topK) + dynamic_tree_max_topK=args.dynamic_tree_max_topK, + allow_advanced_sampling=args.allow_advanced_sampling) + elif spec_decode_algo == "DRAFT_TARGET": spec_config = DraftTargetDecodingConfig( max_draft_len=args.spec_decode_max_draft_len, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index be6ae4bf3c..5da64a5569 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -48,7 +48,8 @@ from ..speculative import (SpecMetadata, get_num_extra_kv_tokens, get_spec_metadata, update_spec_config_from_model_config) from ..speculative.drafting_loops import BaseDraftingLoopWrapper -from ..speculative.eagle3 import Eagle3ResourceManager, Eagle3SpecMetadata +from ..speculative.eagle3 import (Eagle3OneModelSpecMetadata, + Eagle3ResourceManager, Eagle3SpecMetadata) from ..speculative.mtp import SampleStateTensorsMTP from ..speculative.utils import SpecDecodingTensor from ..utils import (get_model_extra_attrs, @@ -2093,6 +2094,9 @@ class PyTorchModelEngine(ModelEngine): num_accepted_draft_tokens)] if isinstance(spec_metadata, Eagle3SpecMetadata): spec_metadata.request_accepted_path = request_accepted_path + if isinstance(spec_metadata, Eagle3OneModelSpecMetadata): + spec_metadata.populate_sampling_params_for_one_model( + scheduled_requests.all_requests()) spec_metadata.prepare() inputs['spec_metadata'] = spec_metadata diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 3fc0027d63..a908ba251f 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -281,6 +281,17 @@ def create_py_executor( ) llm_args.disable_overlap_scheduler = True + if spec_config is not None and spec_config.spec_dec_mode.use_one_engine(): + if not spec_config.allow_advanced_sampling: + logger.warning( + f"Falling back to greedy decoding for {spec_config.decoding_type}. If you " + "want to use non-greedy sampling, please set allow_advanced_sampling=True." + ) + elif spec_config.spec_dec_mode.is_mtp_one_model(): + logger.warning( + "Advanced sampling is not supported for MTP yet - this will be added soon." + ) + if mm_encoder_only: llm_args.mm_encoder_only = True llm_args.disable_overlap_scheduler = True diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py index 89b1ff0ff1..18052f617c 100644 --- a/tensorrt_llm/_torch/speculative/eagle3.py +++ b/tensorrt_llm/_torch/speculative/eagle3.py @@ -14,6 +14,7 @@ from ..pyexecutor.sampler import TorchSampler from ..pyexecutor.scheduler import ScheduledRequests from .interface import SpecMetadata, get_force_num_accepted_tokens from .mtp import MTPSampler +from .one_model_sampler import sampling_batch_spec_dec_one_model from .spec_tree_manager import SpecTreeManager if TYPE_CHECKING: @@ -493,6 +494,40 @@ class Eagle3OneModelWorker(nn.Module): 'next_new_tokens': next_new_tokens, } + def _sample_tokens_for_batch( + self, + logits: torch.Tensor, + spec_metadata: Eagle3OneModelSpecMetadata, + num_contexts: int, + batch_size: int, + ) -> torch.Tensor: + """ + Sample tokens from logits using per-request sampling parameters. + Supports both greedy and non-greedy sampling. 
+ + Args: + logits: [num_tokens, vocab_size] - Logits to sample from + spec_metadata: Metadata containing sampling parameters + batch_size: Number of requests in the batch + + Returns: + sampled_tokens: [num_tokens] - Sampled token ids + """ + if spec_metadata.allow_advanced_sampling: + num_gens = batch_size - num_contexts + num_tokens = num_contexts + num_gens * (self.max_draft_len + 1) + + temperatures = spec_metadata.temperatures[:num_tokens] + top_ks = spec_metadata.top_ks[:num_tokens] + top_ps = spec_metadata.top_ps[:num_tokens] + + sampled_tokens = sampling_batch_spec_dec_one_model( + logits, temperatures, top_ks, top_ps) + else: + sampled_tokens = torch.argmax(logits, dim=-1) + + return sampled_tokens + def sample_and_accept_draft_tokens( self, logits: torch.Tensor, @@ -514,8 +549,9 @@ class Eagle3OneModelWorker(nn.Module): dtype=torch.int, device=logits.device) - # Do greedy sampling for the input logits - target_tokens = torch.argmax(logits, dim=-1) + # Sample tokens using per-request sampling parameters + target_tokens = self._sample_tokens_for_batch(logits, spec_metadata, + num_contexts, batch_size) # context accepted_tokens[:num_contexts, 0] = target_tokens[:num_contexts] @@ -557,6 +593,9 @@ class Eagle3OneModelWorker(nn.Module): Draft token ids. Flattened. ''' + # Note: using greedy for draft tokens is a bit easier to implement and + # faster. It doesn't affect the final output and seems to have a negligible + # impact on AR. draft_tokens = torch.argmax(logits, dim=-1) # Apply d2t (offsets between draft model dictionary and main model dictionary). diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py index a02640f420..9bf262b3cb 100644 --- a/tensorrt_llm/_torch/speculative/interface.py +++ b/tensorrt_llm/_torch/speculative/interface.py @@ -229,6 +229,13 @@ class SpecMetadata: # whether the spec-dec mode is a dynamic tree. is_spec_dec_dynamic_tree: bool = False + # For non-greedy sampling on 1-model. 
+ allow_advanced_sampling: bool = False + # Sampling parameters for non-greedy sampling (per-request) + temperatures: Optional[torch.Tensor] = None + top_ks: Optional[torch.Tensor] = None + top_ps: Optional[torch.Tensor] = None + def __post_init__(self): pass @@ -264,3 +271,83 @@ class SpecMetadata: Some spec decode algorithms require hidden states from the target model. Use this method to record them. By default, does nothing. """ + + def populate_sampling_params_for_one_model( + self, requests: list["LlmRequest"]) -> None: + """ + Set up topp/topk/temperatures for 1-model sampler. + """ + from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState + from tensorrt_llm.sampling_params import SamplingParams + + if not self.allow_advanced_sampling or not self.spec_dec_mode.use_one_engine( + ): + return + + if self.temperatures is None: + # Ensures determinism across ranks. + torch.manual_seed(0) + + temperatures = [] + top_ks = [] + top_ps = [] + + # Need to use a very small value for temperature when disabled to avoid division by 0 + DISABLE_TEMP_VAL = 1e-5 + # Very large values disable topk. + DISABLE_TOPK_VAL = torch.iinfo(torch.int32).max + DISABLE_TOPP_VAL = 1.0 + + for request in requests: + sampling_config = request.sampling_config + temp = sampling_config.temperature + temp_val = temp[0] if temp is not None and len(temp) > 0 else None + + tk = sampling_config.top_k + tk_val = tk[0] if tk is not None and len(tk) > 0 else None + + tp = sampling_config.top_p + tp_val = tp[0] if tp is not None and len(tp) > 0 else None + + # Context requests have no draft tokens yet. 
+ num_tokens = 1 + self.max_draft_len if request.state == LlmRequestState.GENERATION_IN_PROGRESS else 1 + + is_greedy = SamplingParams.params_imply_greedy_decoding( + temperature=temp_val, + top_k=tk_val, + top_p=tp_val, + use_beam_search=False) + + temp_val = DISABLE_TEMP_VAL if is_greedy or temp_val is None or temp_val == 0 else temp_val + tk_val = DISABLE_TOPK_VAL if is_greedy or tk_val is None or tk_val <= 0 else tk_val + tp_val = DISABLE_TOPP_VAL if is_greedy or tp_val is None else tp_val + + temperatures.extend(temp_val for _ in range(num_tokens)) + top_ks.extend(tk_val for _ in range(num_tokens)) + top_ps.extend(tp_val for _ in range(num_tokens)) + + if self.temperatures is None: + self.temperatures = torch.ones( + (self.max_draft_len + 1) * self.max_num_requests, + dtype=torch.float32, + device='cuda') + self.top_ks = torch.zeros( + (self.max_draft_len + 1) * self.max_num_requests, + dtype=torch.int32, + device='cuda') + self.top_ps = torch.ones( + (self.max_draft_len + 1) * self.max_num_requests, + dtype=torch.float32, + device='cuda') + + self.temperatures[:len(temperatures)].copy_(torch.tensor( + temperatures, dtype=torch.float32, pin_memory=True), + non_blocking=True) + self.top_ks[:len(top_ks)].copy_(torch.tensor(top_ks, + dtype=torch.int32, + pin_memory=True), + non_blocking=True) + self.top_ps[:len(top_ps)].copy_(torch.tensor(top_ps, + dtype=torch.float32, + pin_memory=True), + non_blocking=True) diff --git a/tensorrt_llm/_torch/speculative/one_model_sampler.py b/tensorrt_llm/_torch/speculative/one_model_sampler.py new file mode 100644 index 0000000000..ca48c03f28 --- /dev/null +++ b/tensorrt_llm/_torch/speculative/one_model_sampler.py @@ -0,0 +1,91 @@ +from typing import Optional + +import torch + + +def forward_native( + logits: torch.Tensor, + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + """ + PyTorch-native implementation of top-k and top-p sampling. + + The logits tensor may be updated in-place. 
+ """ + logits = apply_top_k_top_p(logits, k, p) + probs = logits.softmax(dim=-1, dtype=torch.float32) + return random_sample(probs) + + +def random_sample( + probs: torch.Tensor, +) -> torch.Tensor: + """Randomly sample from the probabilities. + + We use this function instead of torch.multinomial because torch.multinomial + causes CPU-GPU synchronization. + """ + q = torch.empty_like(probs).exponential_() + return probs.div_(q).argmax(dim=-1).view(-1) + + +def apply_top_k_top_p( + logits: torch.Tensor, + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + """Apply top-k and top-p masks to the logits. + + If a top-p is used, this function will sort the logits tensor, + which can be slow for large batches. + + The logits tensor may be updated in-place. + """ + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) + if k is not None: + # Apply top-k. + top_k_mask = logits_sort.size(1) - k.to(torch.long) # shape: B + top_k_mask = top_k_mask.clamp(min=0) + # Get all the top_k values. + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) + top_k_mask = logits_sort < top_k_mask + logits_sort.masked_fill_(top_k_mask, -float("inf")) + + if p is not None: + # Apply top-p. + probs_sort = logits_sort.softmax(dim=-1) + probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort) + top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) + # at least one + top_p_mask[:, -1] = False + logits_sort.masked_fill_(top_p_mask, -float("inf")) + # Re-sort the probabilities. 
+ logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort) + return logits + + +def apply_temperature( + logits: torch.Tensor, + temp: torch.Tensor, +) -> torch.Tensor: + return logits.div_(temp.unsqueeze(dim=1)) + + +@torch.compile(options={"max-autotune": True}) +def sampling_batch_spec_dec_one_model( + logits: torch.Tensor, + temperatures: torch.Tensor, + top_k: torch.Tensor, + top_p: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + CUDA-graph compatible sampling. Supports mixed sampling params. + + We can't do dynamic kernel selection inside graphs, so this might + be slower than a torch.argmax for greedy requests. This is why advanced + sampling is opt-in for now. + """ + logits = apply_temperature(logits, temperatures) + random_sampled = forward_native(logits, top_k, top_p) + return random_sampled diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index 4ef4ff4296..139787df44 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -76,6 +76,7 @@ def get_spec_metadata(spec_config, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, layers_to_capture=spec_config.eagle3_layers_to_capture, + allow_advanced_sampling=spec_config.allow_advanced_sampling, ) if spec_config.spec_dec_mode.is_save_hidden_states(): if spec_config.eagle3_layers_to_capture is None: diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index cd7858c6f4..b790dc141d 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -619,6 +619,10 @@ class DecodingBaseConfig(StrictBaseModel): # (N = acceptance_window) drops below this value. acceptance_length_threshold: Optional[float] = None + # Prototype. If true, allows non-greedy sampling when speculation is used. Only applicable + # to 1-model code paths; non-greedy sampling is always enabled on 2-model paths. 
+ allow_advanced_sampling: bool = False + # Validate acceptance controls at field level so they run on model creation @field_validator('acceptance_window') @classmethod diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 3b667b15c9..2f27c5dc18 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4307,7 +4307,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): draft_len = 3 spec_config = EagleDecodingConfig(max_draft_len=draft_len, speculative_model_dir=eagle_model_dir, - eagle3_one_model=one_model) + eagle3_one_model=one_model, + allow_advanced_sampling=True) max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN llm = LLM(self.MODEL_PATH, From 64d7796234eff96d5e4e4c85708831a1ebe7d46a Mon Sep 17 00:00:00 2001 From: Faraz <58580514+farazkh80@users.noreply.github.com> Date: Sat, 13 Dec 2025 12:18:10 -0500 Subject: [PATCH 119/172] [None][chore] Add namespace to header to fix tot failure (#9973) --- cpp/tensorrt_llm/thop/cublasScaledMMLut.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/tensorrt_llm/thop/cublasScaledMMLut.h b/cpp/tensorrt_llm/thop/cublasScaledMMLut.h index f190de9a0b..069bd567cf 100644 --- a/cpp/tensorrt_llm/thop/cublasScaledMMLut.h +++ b/cpp/tensorrt_llm/thop/cublasScaledMMLut.h @@ -16,12 +16,16 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace cublas_lut @@ -97,3 +101,5 @@ inline const AlgoListType fp8_algo_list = { } // namespace cublas_lut } // namespace torch_ext + +TRTLLM_NAMESPACE_END From a5a37227d669cabddb074089143334d9bbb69627 Mon Sep 17 00:00:00 2001 From: nvxuanyuc Date: Sat, 13 Dec 2025 18:47:24 -0800 Subject: [PATCH 120/172] [None][feat] Fused kernels (qknormrope + moe routing) and two-model MTP support for glm4moe (#9852) Signed-off-by: Xuanyu Chen --- 
.../kernels/fusedQKNormRopeKernel.cu | 53 +++-- .../kernels/fusedQKNormRopeKernel.h | 1 + cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp | 10 +- tensorrt_llm/_torch/models/modeling_auto.py | 4 +- tensorrt_llm/_torch/models/modeling_glm.py | 185 ++++++++++++++---- .../_torch/models/modeling_speculative.py | 36 +++- .../_torch/modules/qk_norm_attention.py | 7 +- .../defs/accuracy/references/gsm8k.yaml | 2 + .../defs/accuracy/test_llm_api_pytorch.py | 40 +++- .../test_lists/qa/llm_function_core.txt | 3 + .../thop/parallel/test_fused_qk_norm_rope.py | 30 +-- 11 files changed, 289 insertions(+), 82 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu index 73326af8c4..a73ea79270 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu @@ -66,6 +66,7 @@ __global__ void fusedQKNormRopeKernel( int const num_heads_q, // Number of query heads int const num_heads_k, // Number of key heads int const num_heads_v, // Number of value heads + int const rotary_dim, // Dimension for RoPE float const eps, // Epsilon for RMS normalization __nv_bfloat16 const* q_weight, // RMSNorm weights for query __nv_bfloat16 const* k_weight, // RMSNorm weights for key @@ -184,7 +185,7 @@ __global__ void fusedQKNormRopeKernel( int dim_idx = laneId * numElemsPerThread + i; int half_dim = dim_idx / 2; - float freq = powf(base, -2.0f * half_dim / static_cast(head_dim)); + float freq = powf(base, -2.0f * half_dim / static_cast(rotary_dim)); if (factor != 1.0f) { @@ -212,19 +213,20 @@ __global__ void fusedQKNormRopeKernel( { // Before data exchange with in warp, we need to sync. __syncwarp(); + int pairOffset = (rotary_dim / 2) / numElemsPerThread; // Get the data from the other half of the warp. Fill cos_vals and sin_vals. 
for (int i = 0; i < numElemsPerThread; i++) { - elements2[i] = __shfl_xor_sync(0xffffffff, elements[i], 16); - if (laneId < 16) + elements2[i] = __shfl_xor_sync(0xffffffff, elements[i], pairOffset); + if (laneId < pairOffset) { elements2[i] = -elements2[i]; } int dim_idx = laneId * numElemsPerThread + i; - dim_idx = (dim_idx * 2) % head_dim; + dim_idx = (dim_idx * 2) % rotary_dim; int half_dim = dim_idx / 2; - float freq = powf(base, -2.0f * half_dim / static_cast(head_dim)); + float freq = powf(base, -2.0f * half_dim / static_cast(rotary_dim)); if (factor != 1.0f) { @@ -251,9 +253,25 @@ __global__ void fusedQKNormRopeKernel( __syncwarp(); } - for (int i = 0; i < numElemsPerThread; i++) + bool const is_full_rope = (rotary_dim == head_dim); + if (is_full_rope) { - elements[i] = (elements[i] * cos_vals[i] + elements2[i] * sin_vals[i]) * attention_factor; + for (int i = 0; i < numElemsPerThread; i++) + { + elements[i] = (elements[i] * cos_vals[i] + elements2[i] * sin_vals[i]) * attention_factor; + } + } + else + { + for (int i = 0; i < numElemsPerThread; i++) + { + int dim_idx = laneId * numElemsPerThread + i; + + if (dim_idx < rotary_dim) + { + elements[i] = (elements[i] * cos_vals[i] + elements2[i] * sin_vals[i]) * attention_factor; + } + } } // Store. 
@@ -284,14 +302,23 @@ __global__ void fusedQKNormRopeKernel( } void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_q, int const num_heads_k, - int const num_heads_v, int const head_dim, float const eps, void const* q_weight, void const* k_weight, - float const base, bool const interleave, int const* position_ids, float factor, float low, float high, - float attention_factor, cudaStream_t stream, bool is_qk_norm) + int const num_heads_v, int const head_dim, int const rotary_dim, float const eps, void const* q_weight, + void const* k_weight, float const base, bool const interleave, int const* position_ids, float factor, float low, + float high, float attention_factor, cudaStream_t stream, bool is_qk_norm) { if (factor == 1.0f) { TLLM_CHECK(attention_factor == 1.0f); } + + TLLM_CHECK_WITH_INFO(rotary_dim % 2 == 0, "rotary_dim must be even"); + if (!interleave) + { + // To allow warp-level pairing for partial rope + TLLM_CHECK_WITH_INFO( + (rotary_dim * 16) % head_dim == 0, "Unsupported rotary dimension for fusedQKNormRope: %d", rotary_dim); + } + constexpr int blockSize = 256; int const warpsPerBlock = blockSize / 32; @@ -309,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_ case 64: DISPATCH_INTERLEAVE(interleave, INTERLEAVE, { fusedQKNormRopeKernel<64, INTERLEAVE><<>>( - reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, num_heads_v, eps, + reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, num_heads_v, rotary_dim, eps, reinterpret_cast<__nv_bfloat16 const*>(q_weight), reinterpret_cast<__nv_bfloat16 const*>(k_weight), base, position_ids, num_tokens, factor, low, high, attention_factor, is_qk_norm); }); @@ -317,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_ case 128: DISPATCH_INTERLEAVE(interleave, INTERLEAVE, { fusedQKNormRopeKernel<128, INTERLEAVE><<>>( - reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, 
num_heads_v, eps, + reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, num_heads_v, rotary_dim, eps, reinterpret_cast<__nv_bfloat16 const*>(q_weight), reinterpret_cast<__nv_bfloat16 const*>(k_weight), base, position_ids, num_tokens, factor, low, high, attention_factor, is_qk_norm); }); @@ -325,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_ case 256: DISPATCH_INTERLEAVE(interleave, INTERLEAVE, { fusedQKNormRopeKernel<256, INTERLEAVE><<>>( - reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, num_heads_v, eps, + reinterpret_cast<__nv_bfloat16*>(qkv), num_heads_q, num_heads_k, num_heads_v, rotary_dim, eps, reinterpret_cast<__nv_bfloat16 const*>(q_weight), reinterpret_cast<__nv_bfloat16 const*>(k_weight), base, position_ids, num_tokens, factor, low, high, attention_factor, is_qk_norm); }); diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h index 7dab7dbbb2..c976f2a0fe 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h @@ -33,6 +33,7 @@ void launchFusedQKNormRope( int const num_heads_k, // Number of key heads int const num_heads_v, // Number of value heads int const head_dim, // Dimension per head + int const rotary_dim, // Dimension for RoPE float const eps, // Epsilon for RMS normalization void const* q_weight, // RMSNorm weights for query [head_dim] void const* k_weight, // RMSNorm weights for key [head_dim] diff --git a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp index 14bf8578dc..a6635c0285 100644 --- a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp +++ b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp @@ -34,6 +34,7 @@ void fused_qk_norm_rope( int64_t num_heads_k, // Number of key heads int64_t num_heads_v, // Number of value heads int64_t head_dim, // Dimension per head + int64_t rotary_dim, // Dimension for RoPE double eps, // 
Epsilon for RMS normalization torch::Tensor& q_weight, // RMSNorm weights for query [head_dim] torch::Tensor& k_weight, // RMSNorm weights for key [head_dim] @@ -72,9 +73,9 @@ void fused_qk_norm_rope( tensorrt_llm::kernels::launchFusedQKNormRope(reinterpret_cast<__nv_bfloat16*>(qkv.data_ptr()), static_cast(num_tokens), static_cast(num_heads_q), static_cast(num_heads_k), - static_cast(num_heads_v), static_cast(head_dim), static_cast(eps), - reinterpret_cast<__nv_bfloat16*>(q_weight.data_ptr()), reinterpret_cast<__nv_bfloat16*>(k_weight.data_ptr()), - static_cast(base), + static_cast(num_heads_v), static_cast(head_dim), static_cast(rotary_dim), + static_cast(eps), reinterpret_cast<__nv_bfloat16*>(q_weight.data_ptr()), + reinterpret_cast<__nv_bfloat16*>(k_weight.data_ptr()), static_cast(base), !is_neox, // interleave reinterpret_cast(position_ids.data_ptr()), static_cast(factor), static_cast(low), static_cast(high), static_cast(attention_factor), stream, is_qk_norm); @@ -84,7 +85,8 @@ void fused_qk_norm_rope( TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( - "fused_qk_norm_rope(Tensor(a!) qkv, int num_heads_q, int num_heads_k, int num_heads_v, int head_dim, float " + "fused_qk_norm_rope(Tensor(a!) 
qkv, int num_heads_q, int num_heads_k, int num_heads_v, int head_dim, int " + "rotary_dim, float " "eps, Tensor q_weight, Tensor k_weight, float base, bool is_neox, Tensor position_ids, float factor, float " "low, float high, float attention_factor, bool is_qk_norm) -> ()"); } diff --git a/tensorrt_llm/_torch/models/modeling_auto.py b/tensorrt_llm/_torch/models/modeling_auto.py index ff48edc5cb..84c8f73c5a 100644 --- a/tensorrt_llm/_torch/models/modeling_auto.py +++ b/tensorrt_llm/_torch/models/modeling_auto.py @@ -31,7 +31,9 @@ class AutoModelForCausalLM(Generic[TModel, TConfig]): "") # Strip the appended EAGLE3 if hasattr(config.pretrained_config, "draft_vocab_size"): model_arch = "EAGLE3" + model_arch - if model_arch == "DeepseekV3ForCausalLM" and config.spec_config is not None and config.spec_config.max_draft_len == 0: + if model_arch in ( + "DeepseekV3ForCausalLM", "Glm4MoeForCausalLM" + ) and config.spec_config is not None and config.spec_config.max_draft_len == 0: model_arch = "MTPDraftModelForCausalLM" cls = MODEL_CLASS_MAPPING.get(model_arch) diff --git a/tensorrt_llm/_torch/models/modeling_glm.py b/tensorrt_llm/_torch/models/modeling_glm.py index be300bcf08..868e43195b 100644 --- a/tensorrt_llm/_torch/models/modeling_glm.py +++ b/tensorrt_llm/_torch/models/modeling_glm.py @@ -1,3 +1,4 @@ +import inspect import math import os from typing import Dict, List, Optional, Tuple @@ -8,14 +9,10 @@ from tqdm import tqdm from transformers import PretrainedConfig from tensorrt_llm._ipc_utils import can_access_peer -from tensorrt_llm._utils import get_sm_version, is_sm_100f +from tensorrt_llm._utils import get_sm_version from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization.mode import QuantAlgo -from tensorrt_llm.quantization.utils.fp8_utils import ( - resmooth_to_fp8_e8m0, - transform_sf_into_required_layout, -) from ..attention_backend import AttentionMetadata from 
..attention_backend.interface import PositionalEmbeddingParams, RopeParams @@ -29,7 +26,7 @@ from ..distributed import ( from ..model_config import ModelConfig from ..modules.decoder_layer import DecoderLayer from ..modules.embedding import Embedding -from ..modules.fused_moe import MoEWeightLoadingMode, create_moe +from ..modules.fused_moe import MoE, MoEWeightLoadingMode, create_moe from ..modules.gated_mlp import GatedMLP from ..modules.linear import Linear, TensorParallelMode from ..modules.multi_stream_utils import maybe_execute_in_parallel @@ -39,7 +36,142 @@ from ..speculative import SpecMetadata from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor from .modeling_deepseekv3 import DeepseekV3Gate, DeepseekV3MTPHead, moe_reduce_add_shared_output from .modeling_speculative import SpecDecOneEngineForCausalLM -from .modeling_utils import DecoderModel, EagerFusionConfig, _load_weights_impl, register_auto_model +from .modeling_utils import ( + DecoderModel, + EagerFusionConfig, + duplicate_kv_weight, + filter_weights, + register_auto_model, +) + + +class Glm4WeightLoader: + def __init__(self, model, is_draft_model: bool = False): + self.model = model + self.config = model.config + self.model_config = model.model_config + self.is_draft_model = is_draft_model + + def load_weights(self, weights: Dict, allow_partial_loading: bool = False): + def rename_moe_weight(weights: Dict, rename_rules: Dict): + result = {} + for key, value in weights.items(): + new_key = key + for old, new in rename_rules.items(): + new_key = new_key.replace(old, new) + result[new_key] = value + return result + + params_map = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + all_named_modules = dict(self.model.named_modules()) + + tp_size = ( + 1 + if self.model_config.mapping.enable_attention_dp + else self.model_config.mapping.tp_size + ) + num_kv_heads = ( + self.config.num_key_value_heads + if hasattr(self.config, 
"num_key_value_heads") + and self.config.num_key_value_heads is not None + else self.config.num_attention_heads + ) + + for name, module in tqdm(all_named_modules.items(), desc="Loading weights"): + if len(module._parameters) <= 0 or name.startswith("draft_model"): + continue + else: + names = name.split(".") + if "model.layers" in name and int(names[2]) >= self.config.num_hidden_layers: + mtp_layer_idx = int(names[2]) - self.config.num_hidden_layers + names[2] = str( + mtp_layer_idx % self.config.num_nextn_predict_layers + + self.config.num_hidden_layers + ) + name = ".".join(names) + + if names[-1] in params_map: + module_weights = [] + for new_name in params_map[names[-1]]: + fw = filter_weights(".".join(names[:-1] + [new_name]), weights) + if new_name in ["k_proj", "v_proj"]: + num_kv_heads_list = ( + [num_kv_heads] * len(fw) + if isinstance(num_kv_heads, int) + else num_kv_heads + ) + fw = { + k: duplicate_kv_weight( + weight=v[:], + num_kv_heads=num_kv_heads_list[i], + tensor_parallel_size=tp_size, + ) + if k in ["weight", "bias"] + else v + for i, (k, v) in enumerate(fw.items()) + } + module_weights.append(fw) + module.load_weights(weights=module_weights) + elif names[-1] == "experts": + module_weights = filter_weights(name, weights) + module_weights = rename_moe_weight( + module_weights, + { + "down_proj": "w2", + "up_proj": "w3", + "gate_proj": "w1", + }, + ) + module.load_weights( + weights=[module_weights], allow_partial_loading=allow_partial_loading + ) + elif names[-1] == "backend" and isinstance(module, MoE): + # Special case: ConfigurableMoE.backend (TRTLLMGenFusedMoE) + # Currently saved MoE weights don't include 'backend' in their names. + # After MoE refactoring, ConfigurableMoE now has a backend submodule, + # and weights loading is done in the backend, so module name includes '.backend'. + # We need to use parent module name (without .backend) to match saved weight names. 
+ # After MoE refactoring is fully complete, all paths will follow this branch. + parent_name = ".".join(names[:-1]) + module_weights = filter_weights(parent_name, weights) + module_weights = rename_moe_weight( + module_weights, + { + "down_proj": "w2", + "up_proj": "w3", + "gate_proj": "w1", + }, + ) + module.load_weights( + weights=[module_weights], allow_partial_loading=allow_partial_loading + ) + elif names[-1] == "self_attn": + continue + elif names[-1] == "next_layer_layernorm": + continue + else: + module_weights = filter_weights(name, weights) + if hasattr(module, "load_weights"): + args = inspect.getfullargspec(module.load_weights).args + if "allow_partial_loading" not in args: + assert not allow_partial_loading, ( + "allow_partial_loading is not supported for this model" + ) + module.load_weights(weights=[module_weights]) + else: + module.load_weights( + weights=[module_weights], + allow_partial_loading=allow_partial_loading, + ) + else: + for n, p in module.named_parameters(): + if not allow_partial_loading: + assert n in module_weights + if n in module_weights: + p.data.copy_(module_weights[n][:]) class Glm4Attention(QKNormRoPEAttention): @@ -61,7 +193,7 @@ class Glm4Attention(QKNormRoPEAttention): max_position_embeddings=config.max_position_embeddings, bias=config.attention_bias, pos_embd_params=pos_embd_params, - fuse_qk_norm_rope=False, + fuse_qk_norm_rope=True, layer_idx=layer_idx, dtype=config.torch_dtype, dense_bias=False, @@ -98,7 +230,7 @@ class Glm4MoE(nn.Module): topk_group=config.topk_group, routed_scaling_factor=config.routed_scaling_factor, dtype=dtype, - fuse_routing_kernel=False, + fuse_routing_kernel=True, apply_routing=False, moe_backend=model_config.moe_backend, ) @@ -872,40 +1004,11 @@ class Glm4MoeForCausalLM(SpecDecOneEngineForCausalLM[Glm4Model, PretrainedConfig **kwargs, ) - def load_weights(self, weights: Dict): - # model.layers.91.mlp.experts.75.up_proj.weight_scale_2 - _load_weights_impl( - self, - weights, - params_map={ - 
r"(?!.*shared_experts)(?=.*experts?)(.*?)up_proj(.*)": r"\1w3\2", - r"(?!.*shared_experts)(?=.*experts?)(.*?)down_proj(.*)": r"\1w2\2", - r"(?!.*shared_experts)(?=.*experts?)(.*?)gate_proj(.*)": r"\1w1\2", - }, - ) + def load_weights(self, weights: Dict, allow_partial_loading: bool = False): + weight_loader = Glm4WeightLoader(self) + weight_loader.load_weights(weights, allow_partial_loading=allow_partial_loading) def post_load_weights(self): - all_named_modules = dict(self.model.named_modules()) - for name, module in tqdm(all_named_modules.items(), desc="Post loading weights"): - if len(module._parameters) <= 0 or name.startswith("draft_model"): - continue - else: - if ( - self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales() - and is_sm_100f() - and hasattr(module, "weight_scale") - ): - weight, weight_scale = resmooth_to_fp8_e8m0(module.weight, module.weight_scale) - transfromed_scale = transform_sf_into_required_layout( - weight_scale, - mn=weight.shape[0], - k=weight.shape[1], - recipe=(1, 128, 128), - is_sfa=False, - ) - module.weight = nn.Parameter(weight, requires_grad=False) - module.weight_scale = nn.Parameter(transfromed_scale, requires_grad=False) - for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]): if idx == self.config.num_hidden_layers - 1: layer.next_layer_layernorm = self.model.norm diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index a94e288172..17d3aba15f 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -428,12 +428,22 @@ class MTPDraftModel(nn.Module): torch.cuda.Stream]): super().__init__() # Import here to avoid circular import - from .modeling_deepseekv3 import DeepseekV3MTP - - mtp_layer = DeepseekV3MTP(model_config, - layer_idx, - aux_stream_dict, - is_separate_draft_engine=True) + model_type = model_config.pretrained_config.model_type + if model_type 
== "glm4_moe": + from .modeling_glm import Glm4MTP + mtp_layer = Glm4MTP(model_config, + layer_idx, + aux_stream_dict, + is_separate_draft_engine=True) + elif model_type in ["deepseek_v3", "deepseek_v32"]: + from .modeling_deepseekv3 import DeepseekV3MTP + mtp_layer = DeepseekV3MTP(model_config, + layer_idx, + aux_stream_dict, + is_separate_draft_engine=True) + else: + raise ValueError( + f"MTPDraftModel does not support model_type: {model_type}") setattr(self, f"layers.{layer_idx}", mtp_layer) self.layers = mtp_layer self.layer_idx = layer_idx @@ -493,8 +503,18 @@ class MTPDraftModelForCausalLM(DecoderModelForCausalLM[MTPDraftModel, def load_weights(self, weights: Dict): # Import here to avoid circular import - from .modeling_deepseekv3 import DeepseekV3WeightLoader - weight_loader = DeepseekV3WeightLoader(self, is_draft_model=True) + model_type = self.model_config.pretrained_config.model_type + match model_type: + case "glm4_moe": + from .modeling_glm import Glm4WeightLoader + weight_loader = Glm4WeightLoader(self, is_draft_model=True) + case "deepseek_v3" | "deepseek_v32": + from .modeling_deepseekv3 import DeepseekV3WeightLoader + weight_loader = DeepseekV3WeightLoader(self, + is_draft_model=True) + case _: + raise ValueError( + f"Model type {model_type} not supported for MTP") weight_loader.load_weights(weights) def load_weights_from_target_model(self, diff --git a/tensorrt_llm/_torch/modules/qk_norm_attention.py b/tensorrt_llm/_torch/modules/qk_norm_attention.py index 5a794783a1..771e7f79a5 100644 --- a/tensorrt_llm/_torch/modules/qk_norm_attention.py +++ b/tensorrt_llm/_torch/modules/qk_norm_attention.py @@ -229,9 +229,14 @@ class QKNormRoPEAttention(Attention): def apply_qk_norm_rope(self, qkv, position_ids): factor, low, high, attention_factor = compute_yarn_parameters( self.pretrained_config) + + partial_rotary_factor = self.pretrained_config.partial_rotary_factor if hasattr( + self.pretrained_config, "partial_rotary_factor") else 1.0 + rotary_dim = 
int(self.head_dim * partial_rotary_factor) + torch.ops.trtllm.fused_qk_norm_rope( qkv, self.num_heads, self.num_key_value_heads, - self.num_key_value_heads, self.head_dim, + self.num_key_value_heads, self.head_dim, rotary_dim, self.q_norm.variance_epsilon, self.q_norm.weight, self.k_norm.weight, self.pos_embd_params.rope.theta, self.pos_embd_params.is_neox, diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 33f7dddc6b..20143e4540 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -272,6 +272,8 @@ ByteDance-Seed/Seed-OSS-36B-Instruct: - accuracy: 90.8 zai-org/GLM-4.6: - accuracy: 81.3 + - spec_dec_algo: MTP + accuracy: 81.3 - quant_algo: NVFP4 spec_dec_algo: MTP accuracy: 88.0 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2f27c5dc18..538277ba0b 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2869,8 +2869,11 @@ class TestGLM4_6(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_device(4) @pytest.mark.parametrize( "tp_size,pp_size,mtp_nextn,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend", - [pytest.param(4, 1, 2, True, True, True, 16, "CUTLASS")], - ids=["throughput"]) + [ + pytest.param(4, 1, 2, True, True, True, 16, "CUTLASS"), + pytest.param(4, 1, 2, True, True, True, 16, "TRTLLM") + ], + ids=["throughput", "throughput_trtllm"]) def test_nvfp4_multi_gpus(self, tp_size, pp_size, mtp_nextn, cuda_graph, overlap_scheduler, chunked_prefill, max_batch_size, moe_backend): @@ -2897,6 +2900,39 @@ class TestGLM4_6(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize( + 
"tp_size,cuda_graph,overlap_scheduler,chunked_prefill,max_batch_size,moe_backend", + [ + pytest.param(4, True, True, True, 16, "CUTLASS"), + pytest.param(4, True, True, True, 16, "TRTLLM"), + ], + ids=["2model", "2model_trtllm"]) + def test_nvfp4_2_model_mtp(self, tp_size, cuda_graph, overlap_scheduler, + chunked_prefill, max_batch_size, moe_backend): + model_path = f"{llm_models_root()}/glm-4.6-fp4" + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3, + mtp_eagle_one_model=False, + speculative_model_dir=model_path) + + with LLM(model_path, + max_batch_size=max_batch_size, + tensor_parallel_size=tp_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + speculative_config=mtp_config, + enable_chunked_prefill=chunked_prefill) as llm: + + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @pytest.mark.timeout(7200) @pytest.mark.skip_less_device_memory(100000) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 5b5ad88d3b..eab8fea284 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -507,6 +507,9 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm] 
+accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] diff --git a/tests/unittest/_torch/thop/parallel/test_fused_qk_norm_rope.py b/tests/unittest/_torch/thop/parallel/test_fused_qk_norm_rope.py index ab8db650a4..565f8b3b58 100644 --- a/tests/unittest/_torch/thop/parallel/test_fused_qk_norm_rope.py +++ b/tests/unittest/_torch/thop/parallel/test_fused_qk_norm_rope.py @@ -8,8 +8,8 @@ from tensorrt_llm._torch.modules.rotary_embedding import RotaryEmbedding @torch.inference_mode() def torch_ref_rms_norm_rope(qkv, num_heads_q, num_heads_k, num_heads_v, - head_dim, eps, q_weight, k_weight, base, is_neox, - position_ids): + head_dim, rotary_dim, eps, q_weight, k_weight, base, + is_neox, position_ids): """ PyTorch reference implementation of RMSNorm+RoPE for verification. 
@@ -22,6 +22,7 @@ def torch_ref_rms_norm_rope(qkv, num_heads_q, num_heads_k, num_heads_v, num_heads_k: Number of key heads num_heads_v: Number of value heads (unused for normalization/RoPE but needed for tensor splitting) head_dim: Dimension of each head + rotary_dim: Dimension for RoPE eps: Epsilon value for RMS normalization q_weight: RMSNorm weights for query [head_dim] k_weight: RMSNorm weights for key [head_dim] @@ -65,7 +66,7 @@ def torch_ref_rms_norm_rope(qkv, num_heads_q, num_heads_k, num_heads_v, # Create and apply RotaryEmbedding module rope_params = RopeParams( - dim=head_dim, # Set the rotary dimension to match the head dimension + dim=rotary_dim, # Set the rotary dimension theta=base, # Base value for RoPE calculations max_positions=8192 # Large enough for any reasonable hidden size ) @@ -88,10 +89,12 @@ num_heads_groups = [ (16, 8, 8), # Qwen3-0.6B, Qwen3-1.7B (32, 8, 8), # Qwen3-4B, Qwen3-8B, Qwen3-30B-A3B (40, 8, 8), # Qwen3-14B - (64, 8, 8) # Qwen3-32B, Qwen3-235B-A22B + (64, 8, 8), # Qwen3-32B, Qwen3-235B-A22B + (24, 8, 8), # GLM 4.6 ] num_tokens_list = [1, 3, 8, 32, 256] is_neox_list = [False, True] +partial_rotary_factor_list = [1.0, 0.5] dtypes = [torch.bfloat16] # TODO: support float16 @@ -100,8 +103,9 @@ dtypes = [torch.bfloat16] # TODO: support float16 @pytest.mark.parametrize("num_tokens", num_tokens_list) @pytest.mark.parametrize("is_neox", is_neox_list) @pytest.mark.parametrize("dtype", dtypes) -def test_fused_qk_norm_rope(head_dim, num_heads_group, num_tokens, is_neox, - dtype): +@pytest.mark.parametrize("partial_rotary_factor", partial_rotary_factor_list) +def test_fused_qk_norm_rope(head_dim, num_heads_group, num_tokens, + partial_rotary_factor, is_neox, dtype): """ Test the fused QK RMSNorm + RoPE operation with various configurations. 
@@ -143,18 +147,20 @@ def test_fused_qk_norm_rope(head_dim, num_heads_group, num_tokens, is_neox, base = 10000.0 factor, low, high, attention_factor = 1.0, 0, 0, 1.0 + rotary_dim = int(head_dim * partial_rotary_factor) # Run the custom fusedQKNormRope operation torch.ops.trtllm.fused_qk_norm_rope(qkv, num_heads_q, num_heads_k, - num_heads_v, head_dim, eps, q_weight, - k_weight, base, is_neox, position_ids, - factor, low, high, attention_factor, - True) + num_heads_v, head_dim, rotary_dim, eps, + q_weight, k_weight, base, is_neox, + position_ids, factor, low, high, + attention_factor, True) output = qkv # This op is inplace # Compute reference output using TensorRT LLM modules ref_output = torch_ref_rms_norm_rope(qkv_copy, num_heads_q, num_heads_k, - num_heads_v, head_dim, eps, q_weight, - k_weight, base, is_neox, position_ids) + num_heads_v, head_dim, rotary_dim, eps, + q_weight, k_weight, base, is_neox, + position_ids) # Compare outputs from custom kernel vs reference implementation torch.testing.assert_close( From f6b0ddd61df561f7e855a325073716c5fc215d12 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Sun, 14 Dec 2025 03:29:59 +0000 Subject: [PATCH 121/172] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- .../examples/auto_deploy/poetry.lock | 6 +- .../examples/draft_target_model/poetry.lock | 6 +- security_scanning/examples/eagle/poetry.lock | 6 +- .../llm-eval/lm-eval-harness/poetry.lock | 6 +- .../examples/lookahead/poetry.lock | 6 +- security_scanning/examples/medusa/poetry.lock | 6 +- .../models/contrib/baichuan/poetry.lock | 6 +- .../examples/models/contrib/bloom/poetry.lock | 6 +- .../models/contrib/chatglm-6b/poetry.lock | 6 +- .../models/contrib/chatglm2-6b/poetry.lock | 6 +- .../contrib/chatglm3-6b-32k/poetry.lock | 6 +- .../examples/models/contrib/dbrx/poetry.lock | 6 +- 
.../models/contrib/deepseek_v1/poetry.lock | 6 +- .../models/contrib/deepseek_v2/poetry.lock | 6 +- .../models/contrib/falcon/poetry.lock | 6 +- .../examples/models/contrib/gptj/poetry.lock | 6 +- .../models/contrib/gptneox/poetry.lock | 6 +- .../examples/models/contrib/grok/poetry.lock | 6 +- .../models/contrib/internlm/poetry.lock | 6 +- .../examples/models/contrib/jais/poetry.lock | 6 +- .../examples/models/contrib/mpt/poetry.lock | 6 +- .../examples/models/contrib/opt/poetry.lock | 6 +- .../models/contrib/skywork/poetry.lock | 6 +- .../examples/models/contrib/smaug/poetry.lock | 6 +- .../examples/models/contrib/stdit/poetry.lock | 12 +-- .../examples/models/core/commandr/poetry.lock | 6 +- .../examples/models/core/gemma/poetry.lock | 6 +- .../examples/models/core/glm-4-9b/poetry.lock | 6 +- .../examples/models/core/gpt/poetry.lock | 6 +- .../examples/models/core/llama/poetry.lock | 6 +- .../examples/models/core/mamba/poetry.lock | 6 +- .../examples/models/core/nemotron/poetry.lock | 6 +- .../examples/models/core/phi/poetry.lock | 6 +- .../examples/models/core/qwen/poetry.lock | 6 +- .../models/core/qwen2audio/poetry.lock | 6 +- .../examples/models/core/qwenvl/poetry.lock | 6 +- .../models/core/recurrentgemma/poetry.lock | 6 +- .../examples/models/core/whisper/poetry.lock | 6 +- security_scanning/examples/ngram/poetry.lock | 6 +- .../examples/quantization/poetry.lock | 6 +- .../examples/redrafter/poetry.lock | 6 +- .../examples/trtllm-eval/poetry.lock | 6 +- security_scanning/metadata.json | 4 +- security_scanning/poetry.lock | 75 ++++++++++++++++++- security_scanning/pyproject.toml | 1 + .../tests/integration/defs/perf/poetry.lock | 6 +- security_scanning/triton_backend/poetry.lock | 6 +- 47 files changed, 209 insertions(+), 141 deletions(-) diff --git a/security_scanning/examples/auto_deploy/poetry.lock b/security_scanning/examples/auto_deploy/poetry.lock index 1ff1af731e..3b26779361 100644 --- a/security_scanning/examples/auto_deploy/poetry.lock +++ 
b/security_scanning/examples/auto_deploy/poetry.lock @@ -3613,13 +3613,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/draft_target_model/poetry.lock b/security_scanning/examples/draft_target_model/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/draft_target_model/poetry.lock +++ b/security_scanning/examples/draft_target_model/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/eagle/poetry.lock b/security_scanning/examples/eagle/poetry.lock index ad5f201ef1..85b32a8def 100644 --- a/security_scanning/examples/eagle/poetry.lock +++ 
b/security_scanning/examples/eagle/poetry.lock @@ -1796,13 +1796,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock index 70bae1549a..63dc47c612 100644 --- a/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock +++ b/security_scanning/examples/llm-eval/lm-eval-harness/poetry.lock @@ -3251,13 +3251,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/lookahead/poetry.lock b/security_scanning/examples/lookahead/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/lookahead/poetry.lock +++ 
b/security_scanning/examples/lookahead/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/medusa/poetry.lock b/security_scanning/examples/medusa/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/medusa/poetry.lock +++ b/security_scanning/examples/medusa/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/baichuan/poetry.lock b/security_scanning/examples/models/contrib/baichuan/poetry.lock index 803be3fd10..bf963136e4 100644 --- a/security_scanning/examples/models/contrib/baichuan/poetry.lock +++ 
b/security_scanning/examples/models/contrib/baichuan/poetry.lock @@ -1987,13 +1987,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/bloom/poetry.lock b/security_scanning/examples/models/contrib/bloom/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/contrib/bloom/poetry.lock +++ b/security_scanning/examples/models/contrib/bloom/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock index 63cdf97138..f42d9b0a84 100644 --- 
a/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm-6b/poetry.lock @@ -1912,13 +1912,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock index 63cdf97138..f42d9b0a84 100644 --- a/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm2-6b/poetry.lock @@ -1912,13 +1912,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock 
b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock index 63cdf97138..f42d9b0a84 100644 --- a/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock +++ b/security_scanning/examples/models/contrib/chatglm3-6b-32k/poetry.lock @@ -1912,13 +1912,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/dbrx/poetry.lock b/security_scanning/examples/models/contrib/dbrx/poetry.lock index 55afe0d7b0..9f55ff0d16 100644 --- a/security_scanning/examples/models/contrib/dbrx/poetry.lock +++ b/security_scanning/examples/models/contrib/dbrx/poetry.lock @@ -1794,13 +1794,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git 
a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v1/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock index 2f76be099e..6bfa988eb0 100644 --- a/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock +++ b/security_scanning/examples/models/contrib/deepseek_v2/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = 
"sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/falcon/poetry.lock b/security_scanning/examples/models/contrib/falcon/poetry.lock index 1be44e0473..20cd8cc6e7 100644 --- a/security_scanning/examples/models/contrib/falcon/poetry.lock +++ b/security_scanning/examples/models/contrib/falcon/poetry.lock @@ -1854,13 +1854,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/gptj/poetry.lock b/security_scanning/examples/models/contrib/gptj/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/contrib/gptj/poetry.lock +++ b/security_scanning/examples/models/contrib/gptj/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = 
"tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/gptneox/poetry.lock b/security_scanning/examples/models/contrib/gptneox/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/contrib/gptneox/poetry.lock +++ b/security_scanning/examples/models/contrib/gptneox/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/grok/poetry.lock b/security_scanning/examples/models/contrib/grok/poetry.lock index 7bda39c20b..2b9d241cf3 100644 --- a/security_scanning/examples/models/contrib/grok/poetry.lock +++ b/security_scanning/examples/models/contrib/grok/poetry.lock @@ -2707,13 +2707,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = 
"sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/internlm/poetry.lock b/security_scanning/examples/models/contrib/internlm/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/contrib/internlm/poetry.lock +++ b/security_scanning/examples/models/contrib/internlm/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/jais/poetry.lock b/security_scanning/examples/models/contrib/jais/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/contrib/jais/poetry.lock +++ b/security_scanning/examples/models/contrib/jais/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = 
"tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/mpt/poetry.lock b/security_scanning/examples/models/contrib/mpt/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/contrib/mpt/poetry.lock +++ b/security_scanning/examples/models/contrib/mpt/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/opt/poetry.lock b/security_scanning/examples/models/contrib/opt/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/contrib/opt/poetry.lock +++ b/security_scanning/examples/models/contrib/opt/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = 
"sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/skywork/poetry.lock b/security_scanning/examples/models/contrib/skywork/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/contrib/skywork/poetry.lock +++ b/security_scanning/examples/models/contrib/skywork/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/smaug/poetry.lock b/security_scanning/examples/models/contrib/smaug/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/contrib/smaug/poetry.lock +++ b/security_scanning/examples/models/contrib/smaug/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - 
{file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/contrib/stdit/poetry.lock b/security_scanning/examples/models/contrib/stdit/poetry.lock index 5e72560b56..52b7f6995d 100644 --- a/security_scanning/examples/models/contrib/stdit/poetry.lock +++ b/security_scanning/examples/models/contrib/stdit/poetry.lock @@ -147,21 +147,21 @@ typecheck = ["mypy"] [[package]] name = "beartype" -version = "0.22.8" +version = "0.22.9" description = "Unbearably fast near-real-time pure-Python runtime-static type-checker." optional = false python-versions = ">=3.10" files = [ - {file = "beartype-0.22.8-py3-none-any.whl", hash = "sha256:b832882d04e41a4097bab9f63e6992bc6de58c414ee84cba9b45b67314f5ab2e"}, - {file = "beartype-0.22.8.tar.gz", hash = "sha256:b19b21c9359722ee3f7cc433f063b3e13997b27ae8226551ea5062e621f61165"}, + {file = "beartype-0.22.9-py3-none-any.whl", hash = "sha256:d16c9bbc61ea14637596c5f6fbff2ee99cbe3573e46a716401734ef50c3060c2"}, + {file = "beartype-0.22.9.tar.gz", hash = "sha256:8f82b54aa723a2848a56008d18875f91c1db02c32ef6a62319a002e3e25a975f"}, ] [package.extras] -dev = ["autoapi (>=0.9.0)", "celery", "click", "coverage (>=5.5)", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pydata-sphinx-theme (<=0.7.2)", "pygments", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "setuptools", "sphinx", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)", "sqlalchemy", "torch", "tox (>=3.20.1)", "typer", "typing-extensions (>=3.10.0.0)", "xarray"] +dev = 
["autoapi (>=0.9.0)", "celery", "click", "coverage (>=5.5)", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pydata-sphinx-theme (<=0.7.2)", "pygments", "pyinstaller", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "setuptools", "sphinx", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)", "sqlalchemy", "torch", "tox (>=3.20.1)", "typer", "typing-extensions (>=3.10.0.0)", "xarray"] doc-ghp = ["mkdocs-material[imaging] (>=9.6.0)", "mkdocstrings-python (>=1.16.0)", "mkdocstrings-python-xref (>=1.16.0)"] doc-rtd = ["autoapi (>=0.9.0)", "pydata-sphinx-theme (<=0.7.2)", "setuptools", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)"] -test = ["celery", "click", "coverage (>=5.5)", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pygments", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "sphinx", "sqlalchemy", "torch", "tox (>=3.20.1)", "typer", "typing-extensions (>=3.10.0.0)", "xarray"] -test-tox = ["celery", "click", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pygments", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "sphinx", "sqlalchemy", "torch", "typer", "typing-extensions (>=3.10.0.0)", "xarray"] +test = ["celery", "click", "coverage (>=5.5)", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pygments", "pyinstaller", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "sphinx", "sqlalchemy", "torch", "tox (>=3.20.1)", "typer", "typing-extensions 
(>=3.10.0.0)", "xarray"] +test-tox = ["celery", "click", "docutils (>=0.22.0)", "equinox", "fastmcp", "jax[cpu]", "jaxtyping", "langchain", "mypy (>=0.800)", "nuitka (>=1.2.6)", "numba", "numpy", "pandera (>=0.26.0)", "poetry", "polars", "pygments", "pyinstaller", "pyright (>=1.1.370)", "pytest (>=6.2.0)", "redis", "rich-click", "sphinx", "sqlalchemy", "torch", "typer", "typing-extensions (>=3.10.0.0)", "xarray"] test-tox-coverage = ["coverage (>=5.5)"] [[package]] diff --git a/security_scanning/examples/models/core/commandr/poetry.lock b/security_scanning/examples/models/core/commandr/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/core/commandr/poetry.lock +++ b/security_scanning/examples/models/core/commandr/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/gemma/poetry.lock b/security_scanning/examples/models/core/gemma/poetry.lock index 6e4a158c8c..f4dc61f2fe 100644 --- a/security_scanning/examples/models/core/gemma/poetry.lock +++ b/security_scanning/examples/models/core/gemma/poetry.lock @@ -2735,13 +2735,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = 
"tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/glm-4-9b/poetry.lock b/security_scanning/examples/models/core/glm-4-9b/poetry.lock index 63cdf97138..f42d9b0a84 100644 --- a/security_scanning/examples/models/core/glm-4-9b/poetry.lock +++ b/security_scanning/examples/models/core/glm-4-9b/poetry.lock @@ -1912,13 +1912,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/gpt/poetry.lock b/security_scanning/examples/models/core/gpt/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/models/core/gpt/poetry.lock +++ b/security_scanning/examples/models/core/gpt/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - 
{file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/llama/poetry.lock b/security_scanning/examples/models/core/llama/poetry.lock index 221204bc96..308a7d11dd 100644 --- a/security_scanning/examples/models/core/llama/poetry.lock +++ b/security_scanning/examples/models/core/llama/poetry.lock @@ -1854,13 +1854,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/mamba/poetry.lock b/security_scanning/examples/models/core/mamba/poetry.lock index a1d4fd8d28..d71d39a59d 100644 --- a/security_scanning/examples/models/core/mamba/poetry.lock +++ b/security_scanning/examples/models/core/mamba/poetry.lock @@ -1854,13 +1854,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = 
[ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/nemotron/poetry.lock b/security_scanning/examples/models/core/nemotron/poetry.lock index ddb42f46da..cca2b3340d 100644 --- a/security_scanning/examples/models/core/nemotron/poetry.lock +++ b/security_scanning/examples/models/core/nemotron/poetry.lock @@ -1742,13 +1742,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/phi/poetry.lock b/security_scanning/examples/models/core/phi/poetry.lock index 8aace7f280..0443d30f55 100644 --- a/security_scanning/examples/models/core/phi/poetry.lock +++ b/security_scanning/examples/models/core/phi/poetry.lock @@ -1805,13 +1805,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" 
files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index e0d4600e97..fd9789efb6 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -3431,13 +3431,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwen2audio/poetry.lock b/security_scanning/examples/models/core/qwen2audio/poetry.lock index 48e847b41e..9505e4f84b 100644 --- a/security_scanning/examples/models/core/qwen2audio/poetry.lock +++ b/security_scanning/examples/models/core/qwen2audio/poetry.lock @@ -1951,13 +1951,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" 
optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwenvl/poetry.lock b/security_scanning/examples/models/core/qwenvl/poetry.lock index a9b9e21c44..47a61b0ca2 100644 --- a/security_scanning/examples/models/core/qwenvl/poetry.lock +++ b/security_scanning/examples/models/core/qwenvl/poetry.lock @@ -3054,13 +3054,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/recurrentgemma/poetry.lock b/security_scanning/examples/models/core/recurrentgemma/poetry.lock index 8187e11a1f..555a7dd417 100644 --- a/security_scanning/examples/models/core/recurrentgemma/poetry.lock +++ b/security_scanning/examples/models/core/recurrentgemma/poetry.lock @@ -2495,13 +2495,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" 
description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index a39420fe59..30f628db09 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -2846,13 +2846,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/ngram/poetry.lock b/security_scanning/examples/ngram/poetry.lock index 621b416a3d..241d53f9f8 100644 --- a/security_scanning/examples/ngram/poetry.lock +++ b/security_scanning/examples/ngram/poetry.lock @@ -1810,13 +1810,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA 
time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/quantization/poetry.lock b/security_scanning/examples/quantization/poetry.lock index e5647a7b82..4418bfbab5 100644 --- a/security_scanning/examples/quantization/poetry.lock +++ b/security_scanning/examples/quantization/poetry.lock @@ -1954,13 +1954,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/redrafter/poetry.lock b/security_scanning/examples/redrafter/poetry.lock index 52ca91a2b6..73c0ed5f5f 100644 --- a/security_scanning/examples/redrafter/poetry.lock +++ b/security_scanning/examples/redrafter/poetry.lock @@ -1820,13 +1820,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false 
python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/examples/trtllm-eval/poetry.lock b/security_scanning/examples/trtllm-eval/poetry.lock index 970031c4f9..5fa7747a47 100644 --- a/security_scanning/examples/trtllm-eval/poetry.lock +++ b/security_scanning/examples/trtllm-eval/poetry.lock @@ -3253,13 +3253,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index ad9a6f4b94..f160cde0e0 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "461446045e62028ff6bc6516f8e5858e26890742", - "timestamp": "2025-12-13T02:58:07Z" + "commit_hash": "a5a37227d669cabddb074089143334d9bbb69627", + "timestamp": "2025-12-14T03:00:28Z" } diff --git a/security_scanning/poetry.lock 
b/security_scanning/poetry.lock index edc77a01f3..ba7326aabc 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2044,6 +2044,37 @@ ninja = ["ninja (>=1.8.2)"] progress = ["tqdm"] typing = ["mypy", "typing_extensions"] +[[package]] +name = "mistral-common" +version = "1.8.6" +description = "Mistral-common is a library of common utilities for Mistral AI." +optional = false +python-versions = "<3.14,>=3.10.0" +files = [ + {file = "mistral_common-1.8.6-py3-none-any.whl", hash = "sha256:dd8c0e55b397e8167751eb3da147cf23fd970824673ca0e260aa58c888be1b0a"}, + {file = "mistral_common-1.8.6.tar.gz", hash = "sha256:c61702720093f7a06508e81923917b04e35062b9ff396b8512b9c4d1139767ee"}, +] + +[package.dependencies] +jsonschema = ">=4.21.1" +numpy = ">=1.25" +pillow = ">=10.3.0" +pydantic = ">=2.7,<3.0" +pydantic-extra-types = {version = ">=2.10.5", extras = ["pycountry"]} +requests = ">=2.0.0" +tiktoken = ">=0.7.0" +typing-extensions = ">=4.11.0" + +[package.extras] +audio = ["mistral_common[soundfile]", "mistral_common[soxr]"] +hf-hub = ["huggingface-hub (>=0.32.4)"] +image = ["mistral_common[opencv]"] +opencv = ["opencv-python-headless (>=4.0.0)"] +sentencepiece = ["sentencepiece (>=0.2.0)"] +server = ["click (>=8.1.0)", "fastapi[standard] (>=0.115.12)", "pydantic-settings (>=2.9.1)"] +soundfile = ["soundfile (>=0.12.1)"] +soxr = ["soxr (>=0.5.0)"] + [[package]] name = "ml-dtypes" version = "0.5.4" @@ -3721,6 +3752,17 @@ files = [ {file = "pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9"}, ] +[[package]] +name = "pycountry" +version = "24.6.1" +description = "ISO country, subdivision, language, currency and script definitions and their translations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"}, + {file = "pycountry-24.6.1.tar.gz", hash = 
"sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"}, +] + [[package]] name = "pycparser" version = "2.23" @@ -3936,6 +3978,31 @@ files = [ [package.dependencies] typing-extensions = ">=4.14.1" +[[package]] +name = "pydantic-extra-types" +version = "2.10.6" +description = "Extra Pydantic types." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_extra_types-2.10.6-py3-none-any.whl", hash = "sha256:6106c448316d30abf721b5b9fecc65e983ef2614399a24142d689c7546cc246a"}, + {file = "pydantic_extra_types-2.10.6.tar.gz", hash = "sha256:c63d70bf684366e6bbe1f4ee3957952ebe6973d41e7802aea0b770d06b116aeb"}, +] + +[package.dependencies] +pycountry = {version = ">=23", optional = true, markers = "extra == \"pycountry\""} +pydantic = ">=2.5.2" +typing-extensions = "*" + +[package.extras] +all = ["cron-converter (>=1.2.2)", "pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<10)", "pycountry (>=23)", "pymongo (>=4.0.0,<5.0.0)", "python-ulid (>=1,<2)", "python-ulid (>=1,<4)", "pytz (>=2024.1)", "semver (>=3.0.2)", "semver (>=3.0.2,<3.1.0)", "tzdata (>=2024.1)"] +cron = ["cron-converter (>=1.2.2)"] +pendulum = ["pendulum (>=3.0.0,<4.0.0)"] +phonenumbers = ["phonenumbers (>=8,<10)"] +pycountry = ["pycountry (>=23)"] +python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<4)"] +semver = ["semver (>=3.0.2)"] + [[package]] name = "pydantic-settings" version = "2.12.0" @@ -5353,13 +5420,13 @@ typing-extensions = ">=4.12.0" [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = 
"sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] @@ -5773,4 +5840,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "b0bf1650e7b7f69715b76d39327eb3223e6b6f5833582562d252f4c07d3f3ec9" +content-hash = "53efa4790774d420fa887b4dacb4b87369c00aec7860e1ecd91c5e8680ec3f4a" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 274d314db8..1ce8c029c3 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -74,6 +74,7 @@ openai-harmony = "0.0.4" nvidia-cutlass-dsl = "4.3.1" plotly = "^6.5.0" partial-json-parser = "^0.2.1.1.post7" +mistral-common = "1.8.6" [build-system] diff --git a/security_scanning/tests/integration/defs/perf/poetry.lock b/security_scanning/tests/integration/defs/perf/poetry.lock index 811326cd00..55c2560ec2 100644 --- a/security_scanning/tests/integration/defs/perf/poetry.lock +++ b/security_scanning/tests/integration/defs/perf/poetry.lock @@ -666,13 +666,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [metadata] diff --git a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index 6bf91b1756..ed36daa52f 100644 --- 
a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -948,13 +948,13 @@ files = [ [[package]] name = "tzdata" -version = "2025.2" +version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" files = [ - {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, - {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] [[package]] From fcda1a1442e53612ee3194f5aa6e932dcc22abda Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Sun, 14 Dec 2025 12:22:36 +0800 Subject: [PATCH 122/172] [None][fix] disable async pp send for ray cases. 
(#9959) Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- jenkins/L0_MergeRequest.groovy | 1 + tensorrt_llm/_torch/distributed/communicator.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index deaa59fc26..60f48063ee 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -712,6 +712,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars) "tensorrt_llm/_torch/compilation/patterns/ub_allreduce.py", "tensorrt_llm/_torch/custom_ops/torch_custom_ops.py", "tensorrt_llm/_torch/custom_ops/userbuffers_custom_ops.py", + "tensorrt_llm/_torch/distributed/", "tensorrt_llm/_torch/models/modeling_llama.py", "tensorrt_llm/_torch/models/modeling_qwen3_next.py", "tensorrt_llm/_torch/modules/fused_moe/", diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py index 5e4968f298..93457691bd 100644 --- a/tensorrt_llm/_torch/distributed/communicator.py +++ b/tensorrt_llm/_torch/distributed/communicator.py @@ -856,6 +856,13 @@ class PPCommTorch(PPCommBase): def direct_send(self, tensor: torch.Tensor, dest: int): self.pg.send([tensor], self._global_to_local_rank(dest), tag=0).wait() + # TODO: support async pp send for PPCommTorch + def send(self, tensor: torch.Tensor, dest: Optional[int] = None): + if dest is None: + dest = self.mapping.next_pp_rank() + + self.pg.send([tensor], self._global_to_local_rank(dest), tag=0).wait() + def recv(self, tensor: torch.Tensor, src: Optional[int] = None): if src is None: src = self.mapping.prev_pp_rank() From 96d654029d0df37a2b0c7d4599678daebf7bc0b1 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Sun, 14 Dec 2025 02:07:35 -0500 Subject: [PATCH 123/172] [https://nvbugs/5666816][fix] Unwaive llama3 eagle3 test (#9964) Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 
deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 08c35dcf4f..a79bc638c0 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -365,7 +365,6 @@ unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_ test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153) accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438) accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721) -unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-False-False-False-False-True-False-False] SKIP (https://nvbugs/5691246) test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) examples/test_ray.py::test_llm_inference_distributed_ray[tep2] SKIP (https://nvbugs/5701457) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457) From 1375910f1b7707684d3fc15383a2802b1d7dc64c Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Sun, 14 Dec 2025 00:09:33 -0800 Subject: [PATCH 124/172] [None][infra] Delete container before attempting import (#9967) Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 071cf5fe6f..73c4cd16c2 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1021,27 +1021,31 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG srunPrologue = """ export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot' - retry_command() { - local cmd=\$1 - local max_attempts=\${2:-3} - local delay=\${3:-60} + importContainerWithRetries() { + local docker_uri=\$1 + local output_path=\$2 + local max_attempts=\${3:-3} + local delay=\${4:-60} 
local attempt=1 - until \$cmd + rm -f "\$output_path" + + until enroot import -o "\$output_path" -- "docker://\$docker_uri" do if ((attempt >= max_attempts)) then - echo "Command '\$cmd' failed after \$max_attempts attempts" + echo "enroot import failed after \$max_attempts attempts" return 1 fi - echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..." + echo "enroot import failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..." + rm -f "\$output_path" sleep \$delay ((attempt++)) done } - retry_command "enroot import -o $enrootImagePath -- docker://$container" + importContainerWithRetries "$container" "$enrootImagePath" """.replaceAll("(?m)^\\s*", "") } From e0a4b722796c31ad0e34c779355547710b8e231e Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Sun, 14 Dec 2025 22:48:34 +0800 Subject: [PATCH 125/172] [None][infra] Waive failed tests for main branch on 12/14 (#9982) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index a79bc638c0..8eb0eef98d 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -423,3 +423,20 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_mode disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5740377) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740087) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) +unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) +unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[True] SKIP (https://nvbugs/5739981) +unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve[True] SKIP (https://nvbugs/5739981) From 9a1750c8f9839193e0c849bd9cfcd7d9c4203a1a Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Sun, 14 Dec 2025 11:29:30 -0800 Subject: [PATCH 126/172] [TRTLLM-9493][noop] Refactor fusedMoeCommKernels to enable code sharing (#9922) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/cudaAsyncOps.cuh | 218 +++++++++++++ .../kernels/fusedMoeCommKernels.cu | 305 ++---------------- .../kernels/fusedMoeCommKernels.h | 16 +- cpp/tensorrt_llm/kernels/ll128Proto.cuh | 163 ++++++++++ .../kernels/moeCommKernelsCommon.h | 57 ++++ 5 files changed, 470 insertions(+), 289 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/cudaAsyncOps.cuh create mode 100644 cpp/tensorrt_llm/kernels/ll128Proto.cuh diff --git a/cpp/tensorrt_llm/kernels/cudaAsyncOps.cuh b/cpp/tensorrt_llm/kernels/cudaAsyncOps.cuh new file mode 100644 index 0000000000..0e1c879066 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cudaAsyncOps.cuh @@ -0,0 +1,218 @@ +/* + 
* Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include "tensorrt_llm/kernels/moeCommKernelsCommon.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +// ============================================================================ +// Address Conversion Utilities +// ============================================================================ + +static __device__ __forceinline__ uint32_t __as_ptr_smem(void const* __ptr) +{ + // Consider adding debug asserts here. + return static_cast(__cvta_generic_to_shared(__ptr)); +} + +static __device__ __forceinline__ uint64_t __as_ptr_gmem(void const* __ptr) +{ + // Consider adding debug asserts here. 
+ return static_cast(__cvta_generic_to_global(__ptr)); +} + +// ============================================================================ +// Memory Fence Operations +// ============================================================================ + +__device__ __forceinline__ void fence_release_sys() +{ + asm volatile("fence.release.sys;" : : : "memory"); +} + +// ============================================================================ +// Memory Barrier Operations (mbarrier) +// ============================================================================ + +__device__ __forceinline__ void mbarrier_init(uint64_t* addr, uint32_t const& count) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 + asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(addr)), "r"(count) : "memory"); +#endif +} + +__device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_t txCount) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(addr)), "r"(txCount) + : "memory"); +#endif +} + +__device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 + uint64_t state; + asm("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(state) : "r"(__as_ptr_smem(addr)) : "memory"); + return state; +#else + return 0; +#endif +} + +__device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, const uint32_t txCount) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + uint64_t state; + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(state) + : "r"(__as_ptr_smem(addr)), "r"(txCount) + : "memory"); + return state; +#else + return 0; +#endif +} + +__device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_t const& phaseParity) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + uint32_t waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + 
"mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(waitComplete) + : "r"(__as_ptr_smem(addr)), "r"(phaseParity) + : "memory"); + return static_cast(waitComplete); +#else + return false; +#endif +} + +// ============================================================================ +// Async Copy Operations (cp.async for SM80+) +// ============================================================================ + +template +__device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool predGuard) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int) predGuard), + "r"(__as_ptr_smem(dstShm)), "l"(__as_ptr_gmem(srcMem)), "n"(COPY_SIZE)); +#endif +} + +__device__ __forceinline__ void cp_async_commit_group() +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.commit_group;" : : :); +#endif +} + +template +__device__ __forceinline__ void cp_async_wait_group() +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_group %0;" : : "n"(N) : "memory"); +#endif +} + +// ============================================================================ +// Bulk Async Copy Operations (cp.async.bulk for SM90+) +// ============================================================================ + +__device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcMem, int copySize, uint64_t* smemBar) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(dstMem)), "l"(__as_ptr_gmem(srcMem)), "r"(copySize), "r"(__as_ptr_smem(smemBar)) + : "memory"); +#endif +} + +__device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcMem, int copySize) +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 
900 + asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(dstMem)), "r"(__as_ptr_smem(srcMem)), "r"(copySize) + : "memory"); +#endif +} + +__device__ __forceinline__ void cp_async_bulk_commit_group() +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.commit_group;" : : :); +#endif +} + +template +__device__ __forceinline__ void cp_async_bulk_wait_group() +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group %0;" : : "n"(N) : "memory"); +#endif +} + +template +__device__ __forceinline__ void cp_async_bulk_wait_group_read() +{ +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(N) : "memory"); +#endif +} + +// ============================================================================ +// Shared Memory Barrier Helpers +// ============================================================================ + +__device__ __forceinline__ void initSmemBar(uint64_t* smemBar, int laneId) +{ + if (laneId == 0) + { + mbarrier_init(smemBar, WARP_SIZE); + } + __syncwarp(); +} + +__device__ __forceinline__ void smemBarWait(uint64_t* smemBar, uint32_t* phaseParity) +{ + while (!mbarrier_try_wait_parity(smemBar, *phaseParity)) + { + } + *phaseParity = 1 - *phaseParity; +} + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu index 633b276b12..a36fb617b9 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu @@ -20,7 +20,9 @@ #include #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cudaAsyncOps.cuh" #include "tensorrt_llm/kernels/fusedMoeCommKernels.h" +#include "tensorrt_llm/kernels/ll128Proto.cuh" #include "tensorrt_llm/kernels/quantization.cuh" TRTLLM_NAMESPACE_BEGIN @@ -337,154 +339,6 @@ __device__ __forceinline__ void 
dequantize_nvfp4_sharedmem(uint8_t* compact_ptr, #endif } -static __device__ __forceinline__ uint32_t __as_ptr_smem(void const* __ptr) -{ - // Consider adding debug asserts here. - return static_cast(__cvta_generic_to_shared(__ptr)); -} - -static __device__ __forceinline__ uint64_t __as_ptr_gmem(void const* __ptr) -{ - // Consider adding debug asserts here. - return static_cast(__cvta_generic_to_global(__ptr)); -} - -__device__ __forceinline__ void fence_release_sys() -{ - asm volatile("fence.release.sys;" : : : "memory"); -} - -__device__ __forceinline__ void mbarrier_init(uint64_t* addr, uint32_t const& count) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 - asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(addr)), "r"(count) : "memory"); -#endif -} - -__device__ __forceinline__ void mbarrier_expect_tx(uint64_t* addr, const uint32_t txCount) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(addr)), "r"(txCount) - : "memory"); -#endif -} - -__device__ __forceinline__ uint64_t mbarrier_arrive(uint64_t* addr) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 - uint64_t state; - asm("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(state) : "r"(__as_ptr_smem(addr)) : "memory"); - return state; -#else - return 0; -#endif -} - -__device__ __forceinline__ uint64_t mbarrier_arrive_expect_tx(uint64_t* addr, const uint32_t txCount) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - uint64_t state; - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;" - : "=l"(state) - : "r"(__as_ptr_smem(addr)), "r"(txCount) - : "memory"); - return state; -#else - return 0; -#endif -} - -__device__ __forceinline__ bool mbarrier_try_wait_parity(uint64_t* addr, uint32_t const& phaseParity) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - uint32_t waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], 
%2;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(waitComplete) - : "r"(__as_ptr_smem(addr)), "r"(phaseParity) - : "memory"); - return static_cast(waitComplete); -#else - return false; -#endif -} - -template -__device__ __forceinline__ void ldgsts(int* dstShm, int const* srcMem, bool predGuard) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 - asm volatile( - "{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.ca.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int) predGuard), - "r"(__as_ptr_smem(dstShm)), "l"(__as_ptr_gmem(srcMem)), "n"(COPY_SIZE)); -#endif -} - -__device__ __forceinline__ void cp_async_commit_group() -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 - asm volatile("cp.async.commit_group;" : : :); -#endif -} - -template -__device__ __forceinline__ void cp_async_wait_group() -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 800 - asm volatile("cp.async.wait_group %0;" : : "n"(N) : "memory"); -#endif -} - -__device__ __forceinline__ void cp_async_bulk_g2s(void* dstMem, void const* srcMem, int copySize, uint64_t* smemBar) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" - : - : "r"(__as_ptr_smem(dstMem)), "l"(__as_ptr_gmem(srcMem)), "r"(copySize), "r"(__as_ptr_smem(smemBar)) - : "memory"); -#endif -} - -__device__ __forceinline__ void cp_async_bulk_s2g(void* dstMem, void const* srcMem, int copySize) -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(dstMem)), "r"(__as_ptr_smem(srcMem)), "r"(copySize) - : "memory"); -#endif -} - -__device__ __forceinline__ void cp_async_bulk_commit_group() -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm volatile("cp.async.bulk.commit_group;" : : :); -#endif -} - -template -__device__ __forceinline__ void cp_async_bulk_wait_group() -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm 
volatile("cp.async.bulk.wait_group %0;" : : "n"(N) : "memory"); -#endif -} - -template -__device__ __forceinline__ void cp_async_bulk_wait_group_read() -{ -#if defined(__CUDACC__) && __CUDA_ARCH__ >= 900 - asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(N) : "memory"); -#endif -} - __host__ void MoeCommFieldInfo::fillFieldInfo( uint8_t* dataPtr, size_t elementSize, int vectorSize, int stride, cudaDataType_t dataType) { @@ -527,143 +381,47 @@ __host__ void MoeCommFieldInfo::fillFieldInfo( originalDataType = dataType; } -class Ll128Proto +// Wrapper class that delegates to LL128Proto but accepts extra warpId parameter for backward compatibility +class Ll128ProtoWrapper { public: - static constexpr uint32_t INITIALIZED_VALUE = 0xFFFFFFFFU; + static constexpr uint32_t INITIALIZED_VALUE = LL128Proto::INITIALIZED_VALUE; template static __device__ __forceinline__ int checkDataReceivedInShm(uint8_t* sharedMemoryBase, uint64_t step, - int countIn128Bytes, int fifoEntry128ByteIndexBase, int loaded128ByteCount, int warpId, int laneId) + int countIn128Bytes, int fifoEntry128ByteIndexBase, int loaded128ByteCount, int /*warpId*/, int laneId) { - // return value should be how many package already been received. - // 0 means no data received, -1 means has received finish package(should be the very first 128 Byte). 
- uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); - int totalValidCount = 0; - for (int idxBase = loaded128ByteCount; idxBase < countIn128Bytes; idxBase += WARP_SIZE) - { - int idx = idxBase + laneId; - bool valid = false; - bool finish = false; - if (idx < countIn128Bytes) - { - int indexInFifoEntry = fifoEntry128ByteIndexBase + idx; - uint64_t value = aligned128BytesShm[idx * MoeCommFieldInfo::UINT64_PER_128B_BLOCK - + indexInFifoEntry % MoeCommFieldInfo::UINT64_PER_128B_BLOCK]; - if (USE_FINISH) - { - finish = (value == (step & (1ULL << 63ULL))); - valid = (value == step) || finish; - } - else - { - valid = (value == step); - } - } - __syncwarp(); - unsigned validMask = __ballot_sync(WARP_MASK, valid); - // here we check valid in order, if previous valid is not true, we ignore the current valid. - int validCount = (validMask == WARP_MASK) ? WARP_SIZE : (__ffs(~validMask) - 1); - if (USE_FINISH) - { - unsigned finishedMask = __ballot_sync(WARP_MASK, finish); - // finish should be the very first 128 Byte. - if (finishedMask & 0x1) - { - return -1; - } - } - totalValidCount += validCount; - - if (validCount != WARP_SIZE) - { - break; - } - } - return totalValidCount; + return LL128Proto::checkDataReceivedInShm( + sharedMemoryBase, step, countIn128Bytes, fifoEntry128ByteIndexBase, loaded128ByteCount, laneId); } static __device__ __forceinline__ void protoPack(uint8_t* sharedMemoryBase, uint64_t step, int countIn128Bytes, - int fifoEntry128ByteIndexBase, int warpId, int laneId) + int fifoEntry128ByteIndexBase, int /*warpId*/, int laneId) { - uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); - int halfLaneId = laneId % 16; - int halfIndex = laneId / 16; - int tailOffsetIn128Bytes = countIn128Bytes + halfIndex; - // for LL128 15 * 128 Bytes will be packed to 16 * 128 Bytes, each 16 threads is used for one 15 * 128 bytes. 
- for (int idxIn128BytesBase = halfIndex * 15; idxIn128BytesBase < countIn128Bytes; idxIn128BytesBase += 30) - { - int tailFlagIndexFromFifoEntry = fifoEntry128ByteIndexBase + tailOffsetIn128Bytes; - int tailFlagInnerIndex = tailFlagIndexFromFifoEntry % MoeCommFieldInfo::UINT64_PER_128B_BLOCK; - int idxIn128Bytes = idxIn128BytesBase + halfLaneId; - int idxFromFifoEntry = fifoEntry128ByteIndexBase + idxIn128Bytes; - uint64_t tailValue = step; - uint64_t tailInnerIndex = (halfLaneId >= tailFlagInnerIndex) ? halfLaneId + 1 : halfLaneId; - if (halfLaneId == 15) - { - tailInnerIndex = tailFlagInnerIndex; - } - int targetTailIndex = tailOffsetIn128Bytes * MoeCommFieldInfo::UINT64_PER_128B_BLOCK + tailInnerIndex; - if (idxIn128Bytes < countIn128Bytes && halfLaneId < 15) - { - int flagIndex = idxIn128Bytes * MoeCommFieldInfo::UINT64_PER_128B_BLOCK - + idxFromFifoEntry % MoeCommFieldInfo::UINT64_PER_128B_BLOCK; - tailValue = aligned128BytesShm[flagIndex]; - aligned128BytesShm[flagIndex] = step; - } - aligned128BytesShm[targetTailIndex] = tailValue; - tailOffsetIn128Bytes += 2; - } - __syncwarp(); + LL128Proto::protoPack(sharedMemoryBase, step, countIn128Bytes, fifoEntry128ByteIndexBase, laneId); } static __device__ __forceinline__ void protoUnpack(uint8_t* sharedMemoryBase, uint64_t step, int countIn128Bytes, - int fifoEntry128ByteIndexBase, int loaded128ByteCount, int warpId, int laneId) + int fifoEntry128ByteIndexBase, int loaded128ByteCount, int /*warpId*/, int laneId) { - uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); - int halfLaneId = laneId % 16; - int halfIndex = laneId / 16; - int tailOffsetIn128Bytes = countIn128Bytes + halfIndex; - for (int idxIn128BytesBase = halfIndex * 15; idxIn128BytesBase < countIn128Bytes; idxIn128BytesBase += 30) - { - int tailFlagIndexFromFifoEntry = fifoEntry128ByteIndexBase + tailOffsetIn128Bytes; - int tailFlagInnerIndex = tailFlagIndexFromFifoEntry % MoeCommFieldInfo::UINT64_PER_128B_BLOCK; - int idxIn128Bytes = 
idxIn128BytesBase + halfLaneId; - int idxFromFifoEntry = fifoEntry128ByteIndexBase + idxIn128Bytes; - uint64_t tailValue = 0; - int tailInnerIndex = (halfLaneId >= tailFlagInnerIndex) ? halfLaneId + 1 : halfLaneId; - int targetTailIndex = tailOffsetIn128Bytes * MoeCommFieldInfo::UINT64_PER_128B_BLOCK + tailInnerIndex; - if (halfLaneId < 15) - { - tailValue = aligned128BytesShm[targetTailIndex]; - } - if (idxIn128Bytes < countIn128Bytes && halfLaneId < 15) - { - int flagIndex = idxIn128Bytes * MoeCommFieldInfo::UINT64_PER_128B_BLOCK - + idxFromFifoEntry % MoeCommFieldInfo::UINT64_PER_128B_BLOCK; - aligned128BytesShm[flagIndex] = tailValue; - } - tailOffsetIn128Bytes += 2; - } - __syncwarp(); + LL128Proto::protoUnpack( + sharedMemoryBase, step, countIn128Bytes, fifoEntry128ByteIndexBase, loaded128ByteCount, laneId); } - static __device__ __forceinline__ void rearm( - uint32_t* u32FifoPtr, uint64_t step, int countIn128Bytes, int fifoEntry128ByteIndexBase, int warpId, int laneId) + static __device__ __forceinline__ void rearm(uint32_t* u32FifoPtr, uint64_t step, int countIn128Bytes, + int fifoEntry128ByteIndexBase, int /*warpId*/, int laneId) { - // LL128 don't need rearm + LL128Proto::rearm(u32FifoPtr, step, countIn128Bytes, fifoEntry128ByteIndexBase, laneId); } static __device__ __host__ __forceinline__ int computeProtoTransfer128ByteAlignedSize( int compact128ByteSizeBeforeProto) { - // each 15 * 128 byte need one tail 128 byte - int tail128ByteSize = (compact128ByteSizeBeforeProto + 15 * 128 - 1) / (15 * 128) * 128; - return compact128ByteSizeBeforeProto + tail128ByteSize; + return LL128Proto::computeProtoTransfer128ByteAlignedSize(compact128ByteSizeBeforeProto); } }; -using FusedMoeProto = Ll128Proto; +using FusedMoeProto = Ll128ProtoWrapper; // using FusedMoeProto = LamportProto; @@ -797,23 +555,6 @@ __device__ __forceinline__ void unpackAllFields( __syncwarp(); } -__device__ __forceinline__ void initSmemBar(uint64_t* smemBar, int laneId) -{ - if (laneId == 0) - 
{ - mbarrier_init(smemBar, WARP_SIZE); - } - __syncwarp(); -} - -__device__ __forceinline__ void smemBarWait(uint64_t* smemBar, uint32_t* phaseParity) -{ - while (!mbarrier_try_wait_parity(smemBar, *phaseParity)) - { - } - *phaseParity = 1 - *phaseParity; -} - __device__ __forceinline__ void startWorkspaceS2G( uint64_t* fifoEntry, uint8_t* sharedMemoryBase, int send128ByteCount, int fifo128ByteOffset, int warpId, int laneId) { @@ -901,7 +642,7 @@ __device__ __forceinline__ void waitG2SBasicFields() __device__ __forceinline__ void waitG2SOtherFields(uint64_t* memBar, uint32_t* phaseParity) { - tensorrt_llm::kernels::fused_moe_impl::smemBarWait(memBar, phaseParity); + smemBarWait(memBar, phaseParity); } template @@ -988,7 +729,7 @@ public: mFifoEntry128ByteIndexBase = kFifoEntry128ByteCount; mFifoEntryIndex = -1; - tensorrt_llm::kernels::fused_moe_impl::initSmemBar(mSmemBar, mLaneId); + initSmemBar(mSmemBar, mLaneId); } __device__ __forceinline__ uint64_t* getFifoEntryPtr() const @@ -1175,7 +916,7 @@ public: updateReadEntry(); needRelease = false; } - tensorrt_llm::kernels::fused_moe_impl::smemBarWait(mSmemBar, &phaseParity); + smemBarWait(mSmemBar, &phaseParity); loaded128ByteCount += FusedMoeProto::template checkDataReceivedInShm(mShmemBase, mTail, mSingleTransfer128ByteCount, mFifoEntry128ByteIndexBase, loaded128ByteCount, mWarpId, mLaneId); } @@ -1521,7 +1262,7 @@ __global__ void g2sKernel(FusedMoeFieldInfo allFieldInfo, MoeExpertParallelInfo int singleShmSize = singleCommMeta.singleUncompactAlignedSize; - tensorrt_llm::kernels::fused_moe_impl::initSmemBar(&allWarpSmemBar[warpId], laneId); + initSmemBar(&allWarpSmemBar[warpId], laneId); uint32_t phaseParity = 0; uint8_t* sharedMemoryBase = reinterpret_cast(allWarpShm) + singleShmSize * warpId; @@ -1632,7 +1373,7 @@ __global__ void loopbackKernel(FusedMoeFieldInfo sendFieldInfo, FusedMoeFieldInf int recvTokenIndex = recvIndexMapping[tokenIndex]; - 
tensorrt_llm::kernels::fused_moe_impl::initSmemBar(&allWarpSmemBar[warpId], laneId); + initSmemBar(&allWarpSmemBar[warpId], laneId); uint32_t phaseParity = 0; int singleShmSize = sendCommMeta.getSingleShmSize(); diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h index 31aab22507..1a6dfe6a0a 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h @@ -83,11 +83,11 @@ struct MoeCommFieldInfo static constexpr uint64_t kAlign16BytePtrMask = (1ULL << 4) - 1; static constexpr uint32_t kAligned16BMask = (1 << 4) - 1; - // Constants for memory alignment and access - static constexpr int BYTES_PER_128B_BLOCK = 128; - static constexpr int INTS_PER_128B_BLOCK = BYTES_PER_128B_BLOCK / sizeof(int); - static constexpr int UINT64_PER_128B_BLOCK = BYTES_PER_128B_BLOCK / sizeof(uint64_t); - static constexpr int BYTES_PER_16B_BLOCK = 16; + // Constants for memory alignment and access (reference common constants for consistency) + static constexpr int BYTES_PER_128B_BLOCK = tensorrt_llm::kernels::BYTES_PER_128B_BLOCK; + static constexpr int INTS_PER_128B_BLOCK = tensorrt_llm::kernels::INTS_PER_128B_BLOCK; + static constexpr int UINT64_PER_128B_BLOCK = tensorrt_llm::kernels::UINT64_PER_128B_BLOCK; + static constexpr int BYTES_PER_16B_BLOCK = tensorrt_llm::kernels::BYTES_PER_16B_BLOCK; // Will pad one 16 byte for each unaligned field, then head and tail 16 byte might not be aligned // Fill single field info, the fields that need global info is not filled here. 
@@ -253,9 +253,11 @@ public: static constexpr int FIFO_ENTRY_128_BYTE_COUNT = FIFO_ENTRY_BYTES / 128; static constexpr int FIFO_TOTAL_BYTES = FIFO_ENTRY_BYTES * FIFO_DEPTH; static constexpr int FIFO_TOTAL_U64 = FIFO_TOTAL_BYTES / sizeof(uint64_t); - static constexpr int MAX_GROUP_COUNT_PER_BLOCK = 8; + // Reference common constant for consistency + static constexpr int MAX_GROUP_COUNT_PER_BLOCK = tensorrt_llm::kernels::MAX_GROUP_COUNT_PER_BLOCK; - static constexpr int WARP_SIZE = 32; + // Reference common constant for consistency + static constexpr int WARP_SIZE = tensorrt_llm::kernels::WARP_SIZE; static int maxSmCount; static bool maxSmCountUsed; diff --git a/cpp/tensorrt_llm/kernels/ll128Proto.cuh b/cpp/tensorrt_llm/kernels/ll128Proto.cuh new file mode 100644 index 0000000000..6ef51b01f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/ll128Proto.cuh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include "tensorrt_llm/kernels/moeCommKernelsCommon.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +class LL128Proto +{ +public: + static constexpr uint32_t INITIALIZED_VALUE = 0xFFFFFFFFU; + + template + static __device__ __forceinline__ int checkDataReceivedInShm(uint8_t* sharedMemoryBase, uint64_t step, + int countIn128Bytes, int fifoEntry128ByteIndexBase, int loaded128ByteCount, int laneId) + { + // return value should be how many package already been received. + // 0 means no data received, -1 means has received finish package(should be the very first 128 Byte). + uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); + int totalValidCount = 0; + for (int idxBase = loaded128ByteCount; idxBase < countIn128Bytes; idxBase += WARP_SIZE) + { + int idx = idxBase + laneId; + bool valid = false; + bool finish = false; + if (idx < countIn128Bytes) + { + int indexInFifoEntry = fifoEntry128ByteIndexBase + idx; + uint64_t value + = aligned128BytesShm[idx * UINT64_PER_128B_BLOCK + indexInFifoEntry % UINT64_PER_128B_BLOCK]; + if (USE_FINISH) + { + finish = (value == (step & (1ULL << 63ULL))); + valid = (value == step) || finish; + } + else + { + valid = (value == step); + } + } + __syncwarp(); + unsigned validMask = __ballot_sync(WARP_MASK, valid); + // here we check valid in order, if previous valid is not true, we ignore the current valid. + int validCount = (validMask == WARP_MASK) ? WARP_SIZE : (__ffs(~validMask) - 1); + if (USE_FINISH) + { + unsigned finishedMask = __ballot_sync(WARP_MASK, finish); + // finish should be the very first 128 Byte. 
+ if (finishedMask & 0x1) + { + return -1; + } + } + totalValidCount += validCount; + + if (validCount != WARP_SIZE) + { + break; + } + } + return totalValidCount; + } + + static __device__ __forceinline__ void protoPack( + uint8_t* sharedMemoryBase, uint64_t step, int countIn128Bytes, int fifoEntry128ByteIndexBase, int laneId) + { + uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); + int halfLaneId = laneId % 16; + int halfIndex = laneId / 16; + int tailOffsetIn128Bytes = countIn128Bytes + halfIndex; + // for LL128 15 * 128 Bytes will be packed to 16 * 128 Bytes, each 16 threads is used for one 15 * 128 bytes. + for (int idxIn128BytesBase = halfIndex * 15; idxIn128BytesBase < countIn128Bytes; idxIn128BytesBase += 30) + { + int tailFlagIndexFromFifoEntry = fifoEntry128ByteIndexBase + tailOffsetIn128Bytes; + int tailFlagInnerIndex = tailFlagIndexFromFifoEntry % UINT64_PER_128B_BLOCK; + int idxIn128Bytes = idxIn128BytesBase + halfLaneId; + int idxFromFifoEntry = fifoEntry128ByteIndexBase + idxIn128Bytes; + uint64_t tailValue = step; + uint64_t tailInnerIndex = (halfLaneId >= tailFlagInnerIndex) ? 
halfLaneId + 1 : halfLaneId; + if (halfLaneId == 15) + { + tailInnerIndex = tailFlagInnerIndex; + } + int targetTailIndex = tailOffsetIn128Bytes * UINT64_PER_128B_BLOCK + tailInnerIndex; + if (idxIn128Bytes < countIn128Bytes && halfLaneId < 15) + { + int flagIndex = idxIn128Bytes * UINT64_PER_128B_BLOCK + idxFromFifoEntry % UINT64_PER_128B_BLOCK; + tailValue = aligned128BytesShm[flagIndex]; + aligned128BytesShm[flagIndex] = step; + } + aligned128BytesShm[targetTailIndex] = tailValue; + tailOffsetIn128Bytes += 2; + } + __syncwarp(); + } + + static __device__ __forceinline__ void protoUnpack(uint8_t* sharedMemoryBase, uint64_t step, int countIn128Bytes, + int fifoEntry128ByteIndexBase, int loaded128ByteCount, int laneId) + { + uint64_t* aligned128BytesShm = reinterpret_cast(sharedMemoryBase); + int halfLaneId = laneId % 16; + int halfIndex = laneId / 16; + int tailOffsetIn128Bytes = countIn128Bytes + halfIndex; + for (int idxIn128BytesBase = halfIndex * 15; idxIn128BytesBase < countIn128Bytes; idxIn128BytesBase += 30) + { + int tailFlagIndexFromFifoEntry = fifoEntry128ByteIndexBase + tailOffsetIn128Bytes; + int tailFlagInnerIndex = tailFlagIndexFromFifoEntry % UINT64_PER_128B_BLOCK; + int idxIn128Bytes = idxIn128BytesBase + halfLaneId; + int idxFromFifoEntry = fifoEntry128ByteIndexBase + idxIn128Bytes; + uint64_t tailValue = 0; + int tailInnerIndex = (halfLaneId >= tailFlagInnerIndex) ? 
halfLaneId + 1 : halfLaneId; + int targetTailIndex = tailOffsetIn128Bytes * UINT64_PER_128B_BLOCK + tailInnerIndex; + if (halfLaneId < 15) + { + tailValue = aligned128BytesShm[targetTailIndex]; + } + if (idxIn128Bytes < countIn128Bytes && halfLaneId < 15) + { + int flagIndex = idxIn128Bytes * UINT64_PER_128B_BLOCK + idxFromFifoEntry % UINT64_PER_128B_BLOCK; + aligned128BytesShm[flagIndex] = tailValue; + } + tailOffsetIn128Bytes += 2; + } + __syncwarp(); + } + + static __device__ __forceinline__ void rearm( + uint32_t* u32FifoPtr, uint64_t step, int countIn128Bytes, int fifoEntry128ByteIndexBase, int laneId) + { + // LL128 don't need rearm + } + + static __device__ __host__ __forceinline__ int computeProtoTransfer128ByteAlignedSize( + int compact128ByteSizeBeforeProto) + { + // each 15 * 128 byte need one tail 128 byte + int tail128ByteSize = (compact128ByteSizeBeforeProto + 15 * 128 - 1) / (15 * 128) * 128; + return compact128ByteSizeBeforeProto + tail128ByteSize; + } +}; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h index 0993c987e6..a0473e6d3b 100644 --- a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h +++ b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h @@ -23,15 +23,72 @@ TRTLLM_NAMESPACE_BEGIN namespace kernels { +// ============================================================================ +// Alignment Macro +// ============================================================================ + #ifdef __CUDACC__ #define ALIGN_256 __align__(256) #else #define ALIGN_256 alignas(256) #endif +// ============================================================================ +// Warp Constants +// ============================================================================ + constexpr int WARP_SIZE = 32; constexpr uint32_t WARP_MASK = 0xffffffff; +// ============================================================================ +// Memory Block 
Constants +// ============================================================================ + +// Size of a 128-byte aligned block (used for bulk async copies) +constexpr int BYTES_PER_128B_BLOCK = 128; + +// Size of a 16-byte aligned block (used for field alignment) +constexpr int BYTES_PER_16B_BLOCK = 16; + +// Number of int elements per 128-byte block +constexpr int INTS_PER_128B_BLOCK = BYTES_PER_128B_BLOCK / sizeof(int); + +// Number of uint64_t elements per 128-byte block +constexpr int UINT64_PER_128B_BLOCK = BYTES_PER_128B_BLOCK / sizeof(uint64_t); + +// ============================================================================ +// Block Organization Constants +// ============================================================================ + +// Maximum number of groups (warps) per CTA for MoE communication kernels +constexpr int MAX_GROUP_COUNT_PER_BLOCK = 8; + +// ============================================================================ +// Utility Functions +// ============================================================================ + +/** + * Ceiling division: compute ceil(a / b) for integers + */ +template +inline constexpr T ceil_div(T a, T b) +{ + return (a + b - 1) / b; +} + +/** + * Align value up to nearest multiple of alignment + */ +template +inline constexpr T align_up(T value, T alignment) +{ + return ceil_div(value, alignment) * alignment; +} + +// ============================================================================ +// MoE Parallel Info Structures +// ============================================================================ + struct MoeEpWorldInfo { int epSize; From f21e2b3329421a0f4a56a983d97bd211d6e37be1 Mon Sep 17 00:00:00 2001 From: Simeng Liu <109828133+SimengLiu-nv@users.noreply.github.com> Date: Sun, 14 Dec 2025 16:42:30 -0800 Subject: [PATCH 127/172] [TRTLLM-9601][feat] Expose mmKeys for multimodal to integrate with dynamo. 
(#9604) Signed-off-by: SimengLiu-nv --- .../batch_manager/kvCacheManager.h | 6 +- cpp/include/tensorrt_llm/executor/executor.h | 12 +- .../batch_manager/kvCacheEventManager.cpp | 2 +- .../batch_manager/kvCacheManager.cpp | 5 + cpp/tensorrt_llm/executor/serialization.cpp | 5 +- .../nanobind/executor/bindings.cpp | 17 ++- cpp/tensorrt_llm/pybind/executor/bindings.cpp | 17 ++- examples/llm-api/quickstart_advanced.py | 14 ++- examples/llm-api/quickstart_multimodal.py | 18 ++- tensorrt_llm/_utils.py | 28 ++++- tests/integration/test_lists/waives.txt | 1 + .../multimodal/test_mm_encoder_standalone.py | 83 ++++++++++++++ .../llmapi/test_llm_kv_cache_events.py | 106 ++++++++++++++++++ 13 files changed, 303 insertions(+), 11 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 70df824ee8..476b53b243 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -78,9 +78,7 @@ using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens; using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType; using BlocksPerWindow = std::map>; using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType; - -// Type alias for multimodal hash key (hash array + start offset) -using MmKey = std::pair, SizeType32>; +using MmKey = tensorrt_llm::executor::MmKey; template using OptionalRef = tensorrt_llm::common::OptionalRef; @@ -325,6 +323,8 @@ public: size_t getHash() const; + std::vector getExtraKeys() const; + private: // Linear ID of block independent of pool IdType mBlockId; diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index 1217a3729a..dda8f52cc8 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -47,6 +47,12 @@ class BaseKVCacheManager; namespace tensorrt_llm::executor { +using SizeType32 = 
tensorrt_llm::runtime::SizeType32; +// Mmkey is used in KVCacheBlock when multimodal data presents in a block. +// Type alias for hash array + start offset at per-block granularity. +// This differs from the per-request level multimodal hash in MultimodalInput. +using MmKey = std::pair, SizeType32>; + /// @brief Version of TRT-LLM char const* version() noexcept; @@ -1691,12 +1697,14 @@ struct KVCacheStoredBlockData { KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens, - std::optional loraId, SizeType32 cacheLevel, SizeType32 priority) + std::optional loraId, SizeType32 cacheLevel, SizeType32 priority, + std::vector mmKeys = {}) : blockHash{blockHash} , tokens{std::move(tokens)} , loraId{loraId} , cacheLevel{cacheLevel} , priority{priority} + , mmKeys{std::move(mmKeys)} { } @@ -1710,6 +1718,8 @@ struct KVCacheStoredBlockData SizeType32 cacheLevel; /// @brief The priority of the block SizeType32 priority; + /// @brief The multimodal keys of the block + std::vector mmKeys; }; struct KVCacheStoredData diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp index 9babb73fa4..593b5e826c 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp @@ -102,7 +102,7 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector const& blocks for (auto const& block : blocks) { data.blocks.emplace_back(block->getHash(), block->getUniqueTokens(), block->getBlockKey().loraTaskId, - block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority()); + block->isPrimary() ? 
kPrimaryLevel : kSecondaryLevel, block->getPriority(), block->getExtraKeys()); } enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank}); diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index e37e68ffe3..4154be6482 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -284,6 +284,11 @@ tk::KVCacheIndex::UnderlyingType KVCacheBlock::getMemoryPoolBlockIndex() const return mMemoryPoolBlockIndex.get(); } +std::vector KVCacheBlock::getExtraKeys() const +{ + return mBlockKey.extraKeys; +} + bool KVCacheBlock::isPrimary() const { return mMemoryPoolBlockIndex.isPrimary(); diff --git a/cpp/tensorrt_llm/executor/serialization.cpp b/cpp/tensorrt_llm/executor/serialization.cpp index e2c6f874db..8e79563b7d 100644 --- a/cpp/tensorrt_llm/executor/serialization.cpp +++ b/cpp/tensorrt_llm/executor/serialization.cpp @@ -2333,6 +2333,7 @@ size_t Serialization::serializedSize(KVCacheStoredBlockData const& data) totalSize += su::serializedSize(data.loraId); totalSize += su::serializedSize(data.cacheLevel); totalSize += su::serializedSize(data.priority); + totalSize += su::serializedSize(data.mmKeys); return totalSize; } @@ -2343,6 +2344,7 @@ void Serialization::serialize(KVCacheStoredBlockData const& data, std::ostream& su::serialize(data.loraId, os); su::serialize(data.cacheLevel, os); su::serialize(data.priority, os); + su::serialize(data.mmKeys, os); } KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::istream& is) @@ -2352,8 +2354,9 @@ KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::ist auto loraId = su::deserialize>(is); auto cacheLevel = su::deserialize(is); auto priority = su::deserialize(is); + auto mmKeys = su::deserialize>(is); - return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority}; + return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, 
priority, mmKeys}; } // KVcacheRemovedData diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp index ae4936a4df..388af63cac 100644 --- a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp @@ -221,7 +221,22 @@ void initBindings(nb::module_& m) .def_ro("tokens", &tle::KVCacheStoredBlockData::tokens) .def_ro("lora_id", &tle::KVCacheStoredBlockData::loraId) .def_ro("cache_level", &tle::KVCacheStoredBlockData::cacheLevel) - .def_ro("priority", &tle::KVCacheStoredBlockData::priority); + .def_ro("priority", &tle::KVCacheStoredBlockData::priority) + .def_prop_ro("mm_keys", + [](tle::KVCacheStoredBlockData const& self) + { + // Convert std::vector to Python list of tuples (bytes, int) + // MmKey = std::pair, SizeType32> + nb::list result; + for (auto const& mmKey : self.mmKeys) + { + auto const& hashArray = mmKey.first; + auto offset = mmKey.second; + nb::bytes hashBytes(reinterpret_cast(hashArray.data()), hashArray.size()); + result.append(nb::make_tuple(hashBytes, offset)); + } + return result; + }); nb::class_(executor_kv_cache, "KVCacheStoredData") .def_ro("parent_hash", &tle::KVCacheStoredData::parentHash) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index bbb843bedb..e3d9d6c1c6 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -221,7 +221,22 @@ void initBindings(pybind11::module_& m) .def_readonly("tokens", &tle::KVCacheStoredBlockData::tokens) .def_readonly("lora_id", &tle::KVCacheStoredBlockData::loraId) .def_readonly("cache_level", &tle::KVCacheStoredBlockData::cacheLevel) - .def_readonly("priority", &tle::KVCacheStoredBlockData::priority); + .def_readonly("priority", &tle::KVCacheStoredBlockData::priority) + .def_property_readonly("mm_keys", + [](tle::KVCacheStoredBlockData const& self) + { + // Convert std::vector to Python list of 
tuples (bytes, int) + // MmKey = std::pair, SizeType32> + py::list result; + for (auto const& mmKey : self.mmKeys) + { + auto const& hashArray = mmKey.first; + auto offset = mmKey.second; + py::bytes hashBytes(reinterpret_cast(hashArray.data()), hashArray.size()); + result.append(py::make_tuple(hashBytes, offset)); + } + return result; + }); py::class_(executor_kv_cache, "KVCacheStoredData") .def_readonly("parent_hash", &tle::KVCacheStoredData::parentHash) diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index f028d41e55..7aac4cb35f 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -1,4 +1,6 @@ import argparse +import json +import time from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import (AttentionDpConfig, AutoDecodingConfig, @@ -90,6 +92,9 @@ def add_llm_args(parser): default=False, action='store_true') parser.add_argument("--tokens_per_block", type=int, default=32) + parser.add_argument('--log_kv_cache_events', + default=False, + action='store_true') # Runtime parser.add_argument('--disable_overlap_scheduler', @@ -190,7 +195,7 @@ def setup_llm(args, **kwargs): free_gpu_memory_fraction=args.kv_cache_fraction, dtype=args.kv_cache_dtype, tokens_per_block=args.tokens_per_block, - ) + event_buffer_max_size=1024 if args.log_kv_cache_events else 0) spec_decode_algo = args.spec_decode_algo.upper( ) if args.spec_decode_algo is not None else None @@ -355,6 +360,13 @@ def main(): f"[{i}]{sequence_id_text} Generation {output_name}: {sequence.additional_generation_outputs[output_name]}" ) + if args.log_kv_cache_events: + time.sleep(1) # Wait for events to be dispatched + events = llm.get_kv_cache_events(5) + print("=== KV_CACHE_EVENTS_START ===") + print(json.dumps(events, indent=2)) + print("=== KV_CACHE_EVENTS_END ===") + if __name__ == '__main__': main() diff --git a/examples/llm-api/quickstart_multimodal.py b/examples/llm-api/quickstart_multimodal.py 
index 66721a2526..8a6e0c67f8 100644 --- a/examples/llm-api/quickstart_multimodal.py +++ b/examples/llm-api/quickstart_multimodal.py @@ -1,6 +1,7 @@ import argparse import json import os +import time from quickstart_advanced import add_llm_args, setup_llm @@ -264,6 +265,14 @@ def main(): print( f"[{i}] Prompt: {output['user_input']!r}, Generated text: {output['assistant_response']!r}" ) + + if args.log_kv_cache_events: + time.sleep(1) # Wait for events to be dispatched + events = llm.get_kv_cache_events(5) + print("=== KV_CACHE_EVENTS_START ===") + print(json.dumps(events, indent=2)) + print("=== KV_CACHE_EVENTS_END ===") + return # Original single-turn processing logic @@ -272,6 +281,7 @@ def main(): args.prompt = example_medias_and_prompts[args.modality]["prompt"] if args.media is None: args.media = example_medias_and_prompts[args.modality]["media"] + inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer, model_dir=str(llm._hf_model_dir), model_type=model_type, @@ -281,7 +291,6 @@ def main(): image_data_format=image_format, num_frames=args.num_frames, device=args.device) - lora_request = None if args.load_lora: lora_request = model_class.lora_request(len(inputs), args.modality, @@ -306,6 +315,13 @@ def main(): if args.logprobs: print(f"[{i}] Logprobs: {output.outputs[0].logprobs}") + if args.log_kv_cache_events: + time.sleep(1) # Wait for events to be dispatched + events = llm.get_kv_cache_events(5) + print("=== KV_CACHE_EVENTS_START ===") + print(json.dumps(events, indent=2)) + print("=== KV_CACHE_EVENTS_END ===") + if __name__ == "__main__": main() diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index cdcd012bd3..bb264c939c 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -1117,7 +1117,9 @@ class KVCacheEventSerializer: "cache_level": data.cache_level, "priority": - data.priority + data.priority, + "mm_keys": + KVCacheEventSerializer._mm_keys_to_json(data) } @staticmethod @@ -1153,6 +1155,30 @@ class 
KVCacheEventSerializer: "token_extra_id": data.token_extra_id } + @staticmethod + def _mm_key_to_json(data): + # MmKey is a pair of (array, SizeType32) + hash_array, start_offset = data + + # Convert array to hex string + hash_hex = ''.join(f'{b:02x}' for b in hash_array) + return { + "type": "mm_key", + "hash": hash_hex, + "start_offset": start_offset + } + + @staticmethod + def _mm_keys_to_json(data): + # MmKeys is a list of MmKey + if hasattr(data, 'mm_keys') and data.mm_keys: + return [ + KVCacheEventSerializer._mm_key_to_json(mm_key) + for mm_key in data.mm_keys + ] + else: + return [] + def set_prometheus_multiproc_dir() -> object: # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.10/python/sglang/srt/utils.py#L1266 diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8eb0eef98d..8acd4d9178 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -254,6 +254,7 @@ accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075) accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5488118) +accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5738168) test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523) accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319) examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940) diff --git a/tests/unittest/_torch/multimodal/test_mm_encoder_standalone.py b/tests/unittest/_torch/multimodal/test_mm_encoder_standalone.py index 
e089fbd859..4dc0564711 100644 --- a/tests/unittest/_torch/multimodal/test_mm_encoder_standalone.py +++ b/tests/unittest/_torch/multimodal/test_mm_encoder_standalone.py @@ -1,5 +1,6 @@ import json import os +import time from pathlib import Path import pytest @@ -288,3 +289,85 @@ def test_multi_request_batch_chat(model_key, multimodal_model_config): zip(ref_output.outputs, test_output.outputs)): assert ref_gen.text == test_gen.text, \ f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}" + + +@pytest.mark.parametrize( + "prompts,expected_num_duplicates", + [ + # Full reuse: same media + same prompts + # All blocks are reused, thus no duplicates + (["Describe the natural environment in the image."] * 2, 0), + # Partial reuse: same media + different prompts + # Prefix blocks are reused, thus 2 duplicates + ([ + "Describe the natural environment in the image.", + "What objects can you see in the image?", + "Describe the weather in the image.", + ], 2), + ]) +def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates, + multimodal_model_config): + """Test mm_keys in KV cache events with cache reuse scenarios. + + This test verifies: + 1. KV cache events contain mm_keys for multimodal blocks + 2. mm_keys have the expected structure (hash + start_offset) + 3. 
Cache reuse behavior based on media and prompts: + - Same media + same prompts: full reuse (0 duplicate offsets) + - Same media + different prompts: partial reuse (prefix blocks reused) + """ + encoder_model_dir = multimodal_model_config['model_dir'] + + max_tokens = 16 + free_gpu_memory_fraction = 0.6 + + # Use same image for all prompts + media = [example_images[0]] * len(prompts) + + sampling_params = SamplingParams(max_tokens=max_tokens) + kv_cache_config = KvCacheConfig( + enable_block_reuse=True, + free_gpu_memory_fraction=free_gpu_memory_fraction, + event_buffer_max_size=1024, # Enable KV cache events + ) + + llm = LLM(model=encoder_model_dir, + backend='pytorch', + kv_cache_config=kv_cache_config, + max_batch_size=1) + + config_path = os.path.join(llm._hf_model_dir, 'config.json') + with open(config_path, 'r') as f: + model_config = json.load(f) + model_type = model_config['model_type'] + + inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer, + model_dir=llm._hf_model_dir, + model_type=model_type, + modality="image", + prompts=prompts, + media=media, + image_data_format="pt") + + # Generate for each input separately to test KV cache reuse + for inp in inputs: + _ = llm.generate([inp], sampling_params=sampling_params) + + time.sleep(0.5) # Wait for events to be dispatched + events = llm.get_kv_cache_events(10) + + # Extract mm_keys offsets from stored events + mm_keys_offsets = [] + for event in events: + if event and event.get("data", {}).get("type") == "stored": + for block in event["data"].get("blocks", []): + if block.get("mm_keys"): + for mm_key in block["mm_keys"]: + assert "hash" in mm_key, "mm_key should have 'hash' field" + assert "start_offset" in mm_key, "mm_key should have 'start_offset' field" + mm_keys_offsets.append(mm_key["start_offset"]) + + num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets)) + assert num_duplicates == expected_num_duplicates, ( + f"Expected {expected_num_duplicates} duplicate mm_keys offsets, " + 
f"got {num_duplicates}. Offsets: {mm_keys_offsets}") diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py index db90a34413..ee5da20c43 100644 --- a/tests/unittest/llmapi/test_llm_kv_cache_events.py +++ b/tests/unittest/llmapi/test_llm_kv_cache_events.py @@ -93,6 +93,9 @@ def test_kv_cache_event_data_serialization(): assert serialized_event[0]["data"]["parent_hash"] is None assert len(serialized_event[0]["data"]["blocks"]) == 1 assert len(serialized_event[0]["data"]["blocks"][0]["tokens"]) == 4 + # Verify mm_keys field exists (empty for text-only requests) + assert "mm_keys" in serialized_event[0]["data"]["blocks"][0] + assert serialized_event[0]["data"]["blocks"][0]["mm_keys"] == [] req2 = create_llm_request(1, [1, 2, 3, 4, 5]) kv_cache_manager.impl.add_sequence(req2.py_request_id, req2.prompt_len, 1, @@ -104,6 +107,109 @@ def test_kv_cache_event_data_serialization(): serialized_event = KVCacheEventSerializer.serialize(events) +def test_mm_keys_serialization(): + """Test serialization of multimodal keys (mm_keys) in KV cache events.""" + # Test _mm_key_to_json with a mock mm_key tuple (bytes, int) + # MmKey from C++ is converted to (bytes, int) tuple by pybind11 + mock_hash = b'\x01\x02\x03\x04\x05\x06\x07\x08' + b'\x00' * 24 # 32 bytes + mock_offset = 42 + mock_mm_key = (mock_hash, mock_offset) + + result = KVCacheEventSerializer._mm_key_to_json(mock_mm_key) + + assert result["type"] == "mm_key" + assert result["start_offset"] == 42 + # Hash should be converted to hex string + assert result["hash"] == "0102030405060708" + "00" * 24 + assert len(result["hash"]) == 64 # 32 bytes = 64 hex chars + + # Test with different hash values + mock_hash2 = bytes(range(32)) # 0x00 to 0x1f + mock_mm_key2 = (mock_hash2, 100) + result2 = KVCacheEventSerializer._mm_key_to_json(mock_mm_key2) + + assert result2["type"] == "mm_key" + assert result2["start_offset"] == 100 + expected_hash = ''.join(f'{i:02x}' for i in 
range(32)) + assert result2["hash"] == expected_hash + + +def test_mm_keys_deserialization(): + """Test deserialization of mm_keys JSON back to 32-byte hash.""" + # Test case 1: Simple hash pattern + mock_hash = b'\x01\x02\x03\x04\x05\x06\x07\x08' + b'\x00' * 24 # 32 bytes + mock_offset = 42 + mock_mm_key = (mock_hash, mock_offset) + + # Serialize to JSON + json_result = KVCacheEventSerializer._mm_key_to_json(mock_mm_key) + + # Deserialize hex string back to bytes + recovered_hash = bytes.fromhex(json_result["hash"]) + + # Verify the recovered hash matches the original + assert recovered_hash == mock_hash + assert len(recovered_hash) == 32 + assert json_result["start_offset"] == mock_offset + + # Test case 2: Sequential bytes 0x00 to 0x1f + mock_hash2 = bytes(range(32)) + mock_offset2 = 100 + mock_mm_key2 = (mock_hash2, mock_offset2) + + json_result2 = KVCacheEventSerializer._mm_key_to_json(mock_mm_key2) + recovered_hash2 = bytes.fromhex(json_result2["hash"]) + + assert recovered_hash2 == mock_hash2 + assert len(recovered_hash2) == 32 + assert json_result2["start_offset"] == mock_offset2 + + # Test case 3: All 0xFF bytes + mock_hash3 = b'\xff' * 32 + mock_offset3 = 255 + mock_mm_key3 = (mock_hash3, mock_offset3) + + json_result3 = KVCacheEventSerializer._mm_key_to_json(mock_mm_key3) + recovered_hash3 = bytes.fromhex(json_result3["hash"]) + + assert recovered_hash3 == mock_hash3 + assert len(recovered_hash3) == 32 + assert json_result3["hash"] == "ff" * 32 + + # Test case 4: Random-like pattern + mock_hash4 = bytes([0xde, 0xad, 0xbe, 0xef] + [0xca, 0xfe] * 14) + mock_offset4 = 1024 + mock_mm_key4 = (mock_hash4, mock_offset4) + + json_result4 = KVCacheEventSerializer._mm_key_to_json(mock_mm_key4) + recovered_hash4 = bytes.fromhex(json_result4["hash"]) + + assert recovered_hash4 == mock_hash4 + assert len(recovered_hash4) == 32 + + +def test_mm_keys_in_stored_events(): + """Test that mm_keys field is present in stored block events.""" + llm = create_llm() + 
sampling_params = SamplingParams(max_tokens=6, temperature=0.01) + prompt = "Hello, my name is" + + _ = llm.generate(prompt, sampling_params=sampling_params) + + events = llm.get_kv_cache_events(5) + + # Find stored events and verify mm_keys field + for event in events: + if event and event["data"]["type"] == "stored": + blocks = event["data"]["blocks"] + for block in blocks: + # mm_keys should always be present (empty list for text-only) + assert "mm_keys" in block + assert isinstance(block["mm_keys"], list) + # For text-only requests, mm_keys should be empty + assert block["mm_keys"] == [] + + def test_expected_kv_cache_events(): llm = create_llm() sampling_params = SamplingParams(max_tokens=6, temperature=0.01) From bf923a1074e64191febbfc7622441124588e03ff Mon Sep 17 00:00:00 2001 From: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:46:37 +0800 Subject: [PATCH 128/172] [None] [chore] Comments cleanup (#9978) Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> --- ...led_contiguous_gather_grouped_gemm_swiglu_fusion.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py index 5b9d06bb17..2cd24ebe9c 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py @@ -228,10 +228,8 @@ class BlockScaledContiguousGatherGroupedGemmKernel: - Float32 - Float16/BFloat16 - Float8E4M3FN/Float8E5M2 - # {$nv-internal-release begin} # Note: Float4E2M1FN output includes SFC generation and quantization support for internal testing. 
- Float4E2M1FN (with scale factor generation) - # {$nv-internal-release end} :note: Constraints: - MMA tiler M must be 128 or 256 (use_2cta_instrs) @@ -659,7 +657,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: self.mma_inst_shape_mn, ) - # For 2CTA blockscaled kernels, SFB needs to be replicated across peer CTAs. # {$nv-internal-release} + # For 2CTA blockscaled kernels, SFB needs to be replicated across peer CTAs. tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma( self.a_dtype, self.a_major_mode, @@ -696,10 +694,8 @@ class BlockScaledContiguousGatherGroupedGemmKernel: internal_type=cutlass.Int16, ) - # {$nv-internal-release begin} # This modifies the layout to handle overlapping 256x(# of scale factors for a single column of B (nNSF)) # logical blocks for SFB when cta_tile_shape_n=192. - # {$nv-internal-release end} if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 192): x = tma_tensor_sfb.stride[0][1] y = cute.ceil_div(tma_tensor_sfb.shape[0][1], 4) @@ -1524,7 +1520,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: # ((atom_v, rest_v), RestK) # tAgSFA_slice = tAgSFA[(None, mma_tile_coord_mnl[0], None, 0)] - # Apply SFB slicing hack when cta_tile_shape_n=64 # {$nv-internal-release} + # Apply SFB slicing hack when cta_tile_shape_n=64 slice_n = mma_tile_coord_mnl[1] if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 64): slice_n = mma_tile_coord_mnl[1] // 2 @@ -1707,7 +1703,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: tCtAcc = tCtAcc_base[(None, None, None, acc_producer_state.index)] # Apply TMEM pointer offset hack when cta_tile_shape_n=192 or - # cta_tile_shape_n=64 # {$nv-internal-release} + # cta_tile_shape_n=64 tCtSFB_mma = tCtSFB if cutlass.const_expr(self.cta_tile_shape_mnk[1] == 192): From 3be5f3abcf77c9f05d5867e52a2ac3a028a9d556 Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:02:48 +0800 Subject: [PATCH 129/172] [None][fix] Fix regex pattern for 
cubin filtering (#9914) Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/CMakeLists.txt index 7cf669de18..f709496b5b 100644 --- a/cpp/tensorrt_llm/kernels/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/CMakeLists.txt @@ -40,9 +40,7 @@ list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*") function(filter_cuda_archs ARCH SOURCES_VAR) if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - set(FILTER_REGEX - ".*_sm(_)?${ARCH}[.]cubin[.]cpp|^.*Sm(_)?${ARCH}.*cubin.cpp$|.*_sm(_)?${ARCH}[.]cu|^.*Sm(_)?${ARCH}.*cu$" - ) + set(FILTER_REGEX ".*[Ss][Mm]_?${ARCH}(af)?.*(cubin\.cpp|\.cu)$") list(APPEND SOURCES ${${SOURCES_VAR}}) list(APPEND SOURCES_FILTERED ${SOURCES}) list(FILTER SOURCES_FILTERED INCLUDE REGEX "${FILTER_REGEX}") From 4bf42f8fa8860a4626b6642e4a6293c15968d372 Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:03:16 +0800 Subject: [PATCH 130/172] [https://nvbugs/5580297][fix] Skip capture request error test from Ray stage (#9947) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_multi_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index d885f477c0..dd175a4809 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -32,7 +32,7 @@ from .test_llm import ( run_llm_with_postprocess_parallel_and_result_handler, run_llm_abort_request, sampling_params_for_aborting_request) from .test_llm_kv_cache_events import create_llm -from utils.util import (skip_gpu_memory_less_than, skip_single_gpu, +from utils.util import (skip_gpu_memory_less_than, 
skip_single_gpu, skip_ray, unittest_name_func, force_ampere) # isort: on @@ -455,6 +455,7 @@ def test_llm_get_stats_async_tp2(pytorch_backend): llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend) +@skip_ray def test_llm_capture_request_error(): _test_llm_capture_request_error(pytorch_backend=False, tp_size=2) From 355e06d66dd9db0e2a32b690276fe4d359ae19e5 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:16:50 +0800 Subject: [PATCH 131/172] [None][doc] update readme for rpc (#9972) Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> --- tensorrt_llm/executor/rpc/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorrt_llm/executor/rpc/README.md b/tensorrt_llm/executor/rpc/README.md index 76d7b846ab..57229b0f2d 100644 --- a/tensorrt_llm/executor/rpc/README.md +++ b/tensorrt_llm/executor/rpc/README.md @@ -83,3 +83,8 @@ except RPCError as e: # Shutdown server from client client.shutdown_server() ``` + +## Network Security + +The RPC supports built-in HMAC-based authentication to secure the communication between the server and the client. +To enable that, you need to provide a shared secret key (bytes) to both the `RPCServer` and `RPCClient`. 
From f5696df28522593ac5411e346af16f268bc42090 Mon Sep 17 00:00:00 2001 From: xxi <95731198+xxi-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:47:15 +0800 Subject: [PATCH 132/172] [TRTLLM-8961][feat] ConfigurableMoE support DeepGemm (#9858) --- .../modules/fused_moe/configurable_moe.py | 104 ++++- .../_torch/modules/fused_moe/create_moe.py | 4 +- .../modules/fused_moe/fused_moe_deepgemm.py | 374 +++++++++++------- .../defs/accuracy/test_llm_api_pytorch.py | 11 +- tests/integration/defs/conftest.py | 1 + .../test_lists/test-db/l0_b200.yml | 2 + tests/unittest/_torch/modules/conftest.py | 1 + .../unittest/_torch/modules/test_fused_moe.py | 29 +- 8 files changed, 373 insertions(+), 153 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index e5bb52ad20..12e1eb3ca0 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -28,7 +28,7 @@ Design Principles: 4. Unified EPLB integration for backends that support it """ -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import torch @@ -456,6 +456,32 @@ class ConfigurableMoE(MoE): return outputs + def _prepare_workspace_deepgemm( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + all_rank_num_tokens: List[int], + ) -> Optional[torch.Tensor]: + """ + Prepare workspace for DeepGemmFusedMoE backend. 
+ + Args: + x: Input tensor + all_rank_num_tokens: List of token counts for all ranks (used when use_dp is True) + + Returns: + Workspace tensor or None if not using DeepGemmFusedMoE + """ + if not isinstance(self.backend, DeepGemmFusedMoE): + return None + + # Calculate the number of rows + num_rows = x.shape[0] + if self.use_dp: + num_rows = sum(all_rank_num_tokens) + + workspaces = self.backend.get_workspaces([num_rows]) + return workspaces[0] + def _forward_single_chunk( self, x: Union[torch.Tensor, Fp4QuantizedTensor], @@ -473,6 +499,9 @@ class ConfigurableMoE(MoE): is_first_call = self.repeat_idx == 0 is_last_call = self.repeat_idx == self.repeat_count - 1 + # ========== Create workspace for DeepGemmFusedMoE ========== + workspace = self._prepare_workspace_deepgemm(x, all_rank_num_tokens) + # Execute unified flow (handles both separated and fused routing) outputs = self._forward_chunk_impl( x, @@ -483,6 +512,7 @@ class ConfigurableMoE(MoE): is_first_call, is_last_call, do_finalize, + workspace=workspace, ) return outputs @@ -497,6 +527,7 @@ class ConfigurableMoE(MoE): is_first_call: bool, is_last_call: bool, do_finalize: bool = True, + workspace: Optional[dict] = None, ) -> torch.Tensor: """ Unified execution flow for all backends @@ -667,7 +698,7 @@ class ConfigurableMoE(MoE): token_final_scales=token_final_scales, x_sf=x_sf, **self._get_backend_kwargs( - router_logits, do_finalize, all_rank_num_tokens, output_dtype, x + router_logits, do_finalize, all_rank_num_tokens, output_dtype, x, workspace ), ) @@ -688,6 +719,54 @@ class ConfigurableMoE(MoE): return final_hidden_states + def _prepare_workspaces_for_chunk( + self, + all_rank_num_tokens_list: List[Optional[List[int]]], + chunk_size_list: List[int], + use_multi_stream: bool, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Prepare workspaces for chunked execution with DeepGemmFusedMoE backend. + This will also be used for alltoall communication in the future. 
+ + Args: + all_rank_num_tokens_list: List of token counts per rank for each chunk (None if not using DP) + chunk_size_list: List of chunk sizes + use_multi_stream: Whether to use multi-stream execution (requires workspace_1) + + Returns: + Tuple of (workspace_0, workspace_1), where workspace_1 is None if not using multi-stream + """ + workspace_0 = None + workspace_1 = None + + if not isinstance(self.backend, DeepGemmFusedMoE): + return workspace_0, workspace_1 + + # Always need at least workspace_0 + chunk_size_0 = ( + sum(all_rank_num_tokens_list[0]) + if self.use_dp and all_rank_num_tokens_list[0] is not None + else chunk_size_list[0] + ) + workspace_chunk_sizes = [chunk_size_0] + + # Add workspace_1 if using multi-stream for alternating between streams + if use_multi_stream: + chunk_size_1 = ( + sum(all_rank_num_tokens_list[1]) + if self.use_dp and all_rank_num_tokens_list[1] is not None + else chunk_size_list[1] + ) + workspace_chunk_sizes.append(chunk_size_1) + + workspaces = self.backend.get_workspaces(workspace_chunk_sizes) + workspace_0 = workspaces[0] + if use_multi_stream: + workspace_1 = workspaces[1] + + return workspace_0, workspace_1 + def _forward_multiple_chunks( self, x: Union[torch.Tensor, Fp4QuantizedTensor], @@ -734,12 +813,20 @@ class ConfigurableMoE(MoE): x_list = x.split(chunk_size_list) router_logits_list = router_logits.split(chunk_size_list) + # Determine if we need multiple streams for overlapped execution + use_multi_stream = not use_all_to_all and self.aux_stream is not None + # ========== Setup auxiliary stream ========== - if not use_all_to_all and self.aux_stream is not None: + if use_multi_stream: self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() + # ========== Create workspace for DeepGemmFusedMoE ========== + workspace_0, workspace_1 = self._prepare_workspaces_for_chunk( + all_rank_num_tokens_list, chunk_size_list, use_multi_stream + ) + # ========== Execute 
chunking with overlap ========== outputs_list = [] for idx_chunk, (x_chunk, router_logits_chunk) in enumerate(zip(x_list, router_logits_list)): @@ -747,7 +834,7 @@ class ConfigurableMoE(MoE): is_first_call = idx_chunk == 0 and self.repeat_idx == 0 is_last_call = idx_chunk == num_chunks - 1 and self.repeat_idx == self.repeat_count - 1 - if not use_all_to_all and self.aux_stream is not None: + if use_multi_stream: # Alternate between main stream and auxiliary stream # Each stream processes complete chunks (forward + reducescatter) if idx_chunk % 2 == 0: @@ -762,6 +849,7 @@ class ConfigurableMoE(MoE): is_first_call, is_last_call, do_finalize, + workspace=workspace_0, ) else: # Odd chunk: execute on main stream @@ -774,6 +862,7 @@ class ConfigurableMoE(MoE): is_first_call, is_last_call, do_finalize, + workspace=workspace_1, ) else: # No overlap @@ -786,12 +875,13 @@ class ConfigurableMoE(MoE): is_first_call, is_last_call, do_finalize, + workspace=workspace_0, ) outputs_list.append(outputs) # ========== Wait for auxiliary stream to complete ========== - if not use_all_to_all and self.aux_stream is not None: + if use_multi_stream: # Wait for auxiliary stream to complete all its chunks with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.MoeChunkingOverlap].record() @@ -942,6 +1032,7 @@ class ConfigurableMoE(MoE): all_rank_num_tokens: Optional[List[int]] = None, output_dtype: Optional[torch.dtype] = None, x: Optional[torch.Tensor] = None, + workspace: Optional[dict] = None, ) -> Dict: """ Get backend-specific keyword arguments for run_moe @@ -1014,7 +1105,8 @@ class ConfigurableMoE(MoE): # DeepGemm-specific parameters elif self.backend.__class__ == DeepGemmFusedMoE: - pass + if workspace is not None: + kwargs["workspace"] = workspace # TRTLLMGen-specific parameters elif self.backend.__class__ == TRTLLMGenFusedMoE: diff --git a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py index 
f921e25014..281b461006 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py @@ -345,8 +345,8 @@ def create_moe( moe_cls = get_moe_cls(model_config, override_quant_config) if ENABLE_CONFIGURABLE_MOE or moe_cls == CuteDslFusedMoE: - # ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends - if moe_cls in (TRTLLMGenFusedMoE, CuteDslFusedMoE, CutlassFusedMoE): + if moe_cls in (DeepGemmFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE, + CutlassFusedMoE): return ConfigurableMoE( routing_method=routing_method, num_experts=num_experts, diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py index 292eed4c9e..f320b4085e 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py @@ -380,6 +380,8 @@ class DeepGemmFusedMoE(CutlassFusedMoE): VANILLA, apply_router_weight_on_input: bool = False, layer_idx: Optional[int] = None, + init_load_balancer: bool = True, + without_comm: bool = False, ): # moe_max_num_tokens is set in ModelConfig.__post_init__ if not specified # The default value is max_num_tokens * dp_size @@ -407,6 +409,8 @@ class DeepGemmFusedMoE(CutlassFusedMoE): weight_loading_mode=weight_loading_mode, apply_router_weight_on_input=apply_router_weight_on_input, layer_idx=layer_idx, + init_load_balancer=init_load_balancer, + without_comm=without_comm, ) def get_workspace(self, m_max: int, group_size: int): @@ -446,6 +450,23 @@ class DeepGemmFusedMoE(CutlassFusedMoE): } return workspace + def get_workspaces(self, chunk_size_list: list[int]) -> list[dict]: + """ + Get workspaces for multiple chunks. 
+ + Args: + chunk_size_list: List of chunk sizes + + Returns: + List of workspace dictionaries, one per chunk + """ + workspaces = [] + for chunk_size in chunk_size_list: + m_max = fp8_utils.align(chunk_size, 128) + workspace = self.get_workspace(m_max, 128) + workspaces.append(workspace) + return workspaces + def _get_quant_method(self): if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant( exclude_kv_cache=True): @@ -462,6 +483,208 @@ class DeepGemmFusedMoE(CutlassFusedMoE): """DeepGEMM backend currently doesn't support alltoall; honor overrides but default to disabled.""" return AlltoallMethodType.NotEnabled + def quantize_input( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + post_quant_comm: bool = True, + ): + """Quantize inputs prior to post-communication (alltoall/allgather) or before MoE computation. + + Args: + x: Input tensor to quantize + post_quant_comm: + If True, quantize for post-quant communication path. + If False, quantize for non-communication path + + Returns: (x, x_sf) where x_sf is None for DeepGemm + + For DeepGemm with has_deepseek_fp8_block_scales: + - Quantization is deferred to run_moe (after permutation) + - WAR: FP8 block scales doesn't support permutation of quantized inputs + - Similar to CuteDslFusedMoE (see fused_moe_cute_dsl.py:242-253) + """ + x_sf = None + if self.has_deepseek_fp8_block_scales: + # FP8 block scales doesn't support permutation of quantized inputs. + # WAR: The quantization is in run_moe. + pass + else: + raise ValueError( + f"{self.__class__.__name__} doesn't support quantization mode {self.quant_config.quant_mode}." + ) + + return x, x_sf + + def run_moe( + self, + x: torch.Tensor, + token_selected_experts: torch.Tensor, + token_final_scales: torch.Tensor, + x_sf: Optional[torch.Tensor] = None, + workspace: dict = None, + ) -> torch.Tensor: + """ + Run MoE computation with DeepGemm backend. 
+ + This method encapsulates the core MoE computation logic, handling FP8 block scales + quantization with DeepGemm backend. + + Args: + # Standard MoE interface parameters: + x: Input hidden states (unquantized for DeepGemm) + token_selected_experts: Expert IDs [num_tokens, top_k]. If EPLB is enabled, + this represents expert slots [num_tokens, top_k] instead. + token_final_scales: Final scaling factors for each token + x_sf: Input scale factors (should be None for DeepGemm) + workspace: Workspace dictionary containing buffers for intermediate results + Required keys: 'workspace_0', 'workspace_1', 'workspace_sf' + + Returns: + final_hidden_states tensor. + + Note: Similar to CuteDslFusedMoE.run_moe_fp8_block_scales (fused_moe_cute_dsl.py:360-434) + """ + assert self.has_deepseek_fp8_block_scales + assert x_sf is None + assert workspace is not None, "workspace is required for DeepGemm backend" + assert token_selected_experts is not None + assert token_final_scales is not None + + # Permutation + ( + permuted_row_to_unpermuted_row_tensor, + permuted_token_selected_experts_tensor, + permuted_data_tensor, + expert_first_token_offset_tensor, + permuted_token_final_scales_tensor, + unpermuted_row_to_permuted_row_tensor, + ) = torch.ops.trtllm.moe_permute_op( + x, + token_selected_experts, + token_final_scales, + None, # w3_w1_weight.view(weight_dtype), + None, # w2_weight.view(weight_dtype), + None, # quant_scales, + input_sf=x_sf, + num_experts_on_rank=self.expert_size_per_partition, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + cluster_size=self.cluster_size, + cluster_rank=self.cluster_rank, + min_latency_mode=False, + use_fp8_block_scaling=True, + ) + + if permuted_data_tensor.numel() == 0: + return torch.zeros_like(x) + + # Preprocess after permute + masked_m, token_to_expert_map = preprocess_after_permute( + expert_first_token_offset_tensor, permuted_data_tensor) + + expected_m = (token_selected_experts.numel() + 
+ self.expert_size_per_partition - + 1) // self.expert_size_per_partition + + # Padding and quantization + m_max = fp8_utils.align(x.shape[0], 128) + act_input_fp8 = set_strides(workspace["workspace_0"], + self.expert_size_per_partition, m_max, + self.hidden_size) + + m_padded = fp8_utils.align(m_max, 4) + scale_k = fp8_utils.ceil_div(self.hidden_size, 128) + scale_k_padded = fp8_utils.align(scale_k, 4) + act_input_sf = set_strides(workspace["workspace_sf"], + self.expert_size_per_partition, + scale_k_padded // 4, m_padded) + + act_input_sf = masked_index_copy_group_quant_fp8( + act_input_fp8, + act_input_sf, + permuted_data_tensor, + expert_first_token_offset_tensor, + token_to_expert_map, + group_size=128) + + # Grouped gemm 1 + h1 = set_strides(workspace["workspace_1"], + self.expert_size_per_partition, m_max, + self.intermediate_size_per_partition * 2) + + deepgemm_fp8_group_blockwise_gemm( + d=h1, + a=act_input_fp8, + b=self.w3_w1_weight, + sfa=act_input_sf, + sfb=self.quant_scales[0], + masked_m=masked_m, + expected_m=expected_m, + ) + + # Activation and quantization + act_input_fp8 = set_strides(workspace["workspace_0"], + self.expert_size_per_partition, m_max, + self.intermediate_size_per_partition) + + scale_k = fp8_utils.ceil_div(self.intermediate_size_per_partition, 128) + scale_k_padded = fp8_utils.align(scale_k, 4) + act_input_sf = set_strides(workspace["workspace_sf"], + self.expert_size_per_partition, + scale_k_padded // 4, m_padded) + + act_input_sf = fp8_utils.silu_and_mul_masked_post_quant_fwd( + output=act_input_fp8, + output_scale=act_input_sf, + input=h1, + quant_group_size=128, + masked_m=masked_m, + scale_ue8m0=True) + + # Grouped gemm 2 + h3 = set_strides(workspace["workspace_1"], + self.expert_size_per_partition, m_max, + self.hidden_size) + + deepgemm_fp8_group_blockwise_gemm( + d=h3, + a=act_input_fp8, + b=self.w2_weight, + sfa=act_input_sf, + sfb=self.quant_scales[1], + masked_m=masked_m, + expected_m=expected_m, + ) + + # Gather and 
finalize + triton_masked_index_gather(permuted_data_tensor, h3, + expert_first_token_offset_tensor, + token_to_expert_map) + + final_hidden_states = torch.ops.trtllm.moe_finalize_scale_op( + permuted_data_tensor, + None, # biases + token_final_scales, + unpermuted_row_to_permuted_row_tensor, + permuted_row_to_unpermuted_row_tensor, + token_selected_experts, + expert_first_token_offset_tensor, + False, # enable_alltoall + x.shape[0], # num_rows + x.shape[1], # (possibly padded) hidden_size + self.unpadded_hidden_size, # original hidden size + self.routing_method.top_k, + self.expert_size_per_partition, # num_experts_per_node + self.tp_size, + self.tp_rank, + self.ep_size, + self.ep_rank, + ) + + return final_hidden_states + @nvtx_range("[DG] forward") def forward_chunk( self, @@ -495,11 +718,10 @@ class DeepGemmFusedMoE(CutlassFusedMoE): token_final_scales = None # quantize inputs - use_deepseek_fp8_block_scale = False x_sf = None if self.has_any_quant: if self.has_deepseek_fp8_block_scales: - use_deepseek_fp8_block_scale = True + pass else: raise ValueError( f"unsupported quantization mode for CUTEDSL backend: {self.quant_config.quant_mode}" @@ -513,135 +735,13 @@ class DeepGemmFusedMoE(CutlassFusedMoE): dim=0, sizes=None if use_dp_padding else all_rank_num_tokens) - ( - permuted_row_to_unpermuted_row_tensor, - permuted_token_selected_experts_tensor, - permuted_data_tensor, - expert_first_token_offset_tensor, - permuted_token_final_scales_tensor, - unpermuted_row_to_permuted_row_tensor, - ) = torch.ops.trtllm.moe_permute_op( - x, - token_selected_experts, - token_final_scales, - None, # w3_w1_weight.view(weight_dtype), - None, # w2_weight.view(weight_dtype), - None, # quant_scales, - input_sf=x_sf, - num_experts_on_rank=self.expert_size_per_partition, - tp_size=self.tp_size, - tp_rank=self.tp_rank, - ep_size=self.ep_size, - ep_rank=self.ep_rank, - cluster_size=self.cluster_size, - cluster_rank=self.cluster_rank, - min_latency_mode=False, - 
use_fp8_block_scaling=use_deepseek_fp8_block_scale, - ) - - if permuted_data_tensor.numel() == 0: - return torch.zeros_like(x) - - masked_m, token_to_expert_map = preprocess_after_permute( - expert_first_token_offset_tensor, permuted_data_tensor) - - expected_m = (token_selected_experts.numel() + - self.expert_size_per_partition - - 1) // self.expert_size_per_partition - - # padding and quantization - m_max = fp8_utils.align(x.shape[0], 128) - act_input_fp8 = set_strides(workspace["workspace_0"], - self.expert_size_per_partition, m_max, - self.hidden_size) - - m_padded = fp8_utils.align(m_max, 4) - scale_k = fp8_utils.ceil_div(self.hidden_size, 128) - scale_k_padded = fp8_utils.align(scale_k, 4) - act_input_sf = set_strides(workspace["workspace_sf"], - self.expert_size_per_partition, - scale_k_padded // 4, m_padded) - - act_input_sf = masked_index_copy_group_quant_fp8( - act_input_fp8, - act_input_sf, - permuted_data_tensor, - expert_first_token_offset_tensor, - token_to_expert_map, - group_size=128) - - # grouped gemm 1 - h1 = set_strides(workspace["workspace_1"], - self.expert_size_per_partition, m_max, - self.intermediate_size_per_partition * 2) - - deepgemm_fp8_group_blockwise_gemm( - d=h1, - a=act_input_fp8, - b=self.w3_w1_weight, - sfa=act_input_sf, - sfb=self.quant_scales[0], - masked_m=masked_m, - expected_m=expected_m, - ) - - # activation and quantization - act_input_fp8 = set_strides(workspace["workspace_0"], - self.expert_size_per_partition, m_max, - self.intermediate_size_per_partition) - - scale_k = fp8_utils.ceil_div(self.intermediate_size_per_partition, 128) - scale_k_padded = fp8_utils.align(scale_k, 4) - act_input_sf = set_strides(workspace["workspace_sf"], - self.expert_size_per_partition, - scale_k_padded // 4, m_padded) - - act_input_sf = fp8_utils.silu_and_mul_masked_post_quant_fwd( - output=act_input_fp8, - output_scale=act_input_sf, - input=h1, - quant_group_size=128, - masked_m=masked_m, - scale_ue8m0=True) - - # grouped gemm 2 - h3 = 
set_strides(workspace["workspace_1"], - self.expert_size_per_partition, m_max, - self.hidden_size) - - deepgemm_fp8_group_blockwise_gemm( - d=h3, - a=act_input_fp8, - b=self.w2_weight, - sfa=act_input_sf, - sfb=self.quant_scales[1], - masked_m=masked_m, - expected_m=expected_m, - ) - - # gather and finalize - triton_masked_index_gather(permuted_data_tensor, h3, - expert_first_token_offset_tensor, - token_to_expert_map) - - final_hidden_states = torch.ops.trtllm.moe_finalize_scale_op( - permuted_data_tensor, - None, # biases - token_final_scales, - unpermuted_row_to_permuted_row_tensor, - permuted_row_to_unpermuted_row_tensor, - token_selected_experts, - expert_first_token_offset_tensor, - False, # enable_alltoall - x.shape[0], # num_rows - x.shape[1], # (possibly padded) hidden_size - self.unpadded_hidden_size, # original hidden size - self.routing_method.top_k, - self.expert_size_per_partition, # num_experts_per_node - self.tp_size, - self.tp_rank, - self.ep_size, - self.ep_rank, + # Call run_moe to handle the core MoE computation + final_hidden_states = self.run_moe( + x=x, + token_selected_experts=token_selected_experts, + token_final_scales=token_final_scales, + x_sf=x_sf, + workspace=workspace, ) return final_hidden_states @@ -683,15 +783,14 @@ class DeepGemmFusedMoE(CutlassFusedMoE): num_rows = x.shape[0] if self.use_dp: num_rows = sum(all_rank_num_tokens_padded) - m_max = fp8_utils.align(num_rows, 128) - workspace = self.get_workspace(m_max, 128) + workspaces = self.get_workspaces([num_rows]) outputs = self.forward_chunk( x, router_logits, output_dtype, all_rank_num_tokens=all_rank_num_tokens_padded, use_dp_padding=use_dp_padding, - workspace=workspace) + workspace=workspaces[0]) outputs = self.reducescatter_or_allreduce( outputs, all_rank_num_tokens=all_rank_num_tokens_padded, @@ -715,10 +814,9 @@ class DeepGemmFusedMoE(CutlassFusedMoE): ) if self.use_dp else chunk_size_list[0] chunk_size_1 = sum(all_rank_num_tokens_list[1] ) if self.use_dp else 
chunk_size_list[1] - workspace_0 = self.get_workspace(fp8_utils.align(chunk_size_0, 128), - 128) - workspace_1 = self.get_workspace(fp8_utils.align(chunk_size_1, 128), - 128) + workspaces = self.get_workspaces([chunk_size_0, chunk_size_1]) + workspace_0 = workspaces[0] + workspace_1 = workspaces[1] x_list = x.split(chunk_size_list) router_logits_list = router_logits.split(chunk_size_list) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 538277ba0b..af8dd5073f 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1427,8 +1427,17 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): (False, False, False, True), (True, False, True, True), (True, True, True, True)]) @parametrize_with_ids("mtp", ["disable", "eagle", "vanilla"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph, - overlap_scheduler, torch_compile): + overlap_scheduler, torch_compile, + enable_configurable_moe, mocker): + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if torch_compile and mtp != "disable": pytest.skip("https://nvbugs/5252313") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 792eca22a7..be77132aaf 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2212,6 +2212,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): # Test cases that use enable_configurable_moe parameter and need ID conversion TESTS_WITH_CONFIGURABLE_MOE = [ "TestDeepSeekV3Lite::test_nvfp4_4gpus", + 
"TestDeepSeekV3Lite::test_fp8_block_scales", "TestGPTOSS::test_w4_4gpus", "TestGPTOSS::test_w4_4gpus_online_eplb", "TestQwen3_30B_A3B::test_w4a8_mxfp4", diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 3b59f51118..41ded067b0 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -89,6 +89,7 @@ l0_b200: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-CUTLASS-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod] # ------------- AutoDeploy tests --------------- - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1] @@ -162,5 +163,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[enable_configurable_moe-mtp=disable-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - 
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype diff --git a/tests/unittest/_torch/modules/conftest.py b/tests/unittest/_torch/modules/conftest.py index c7e85eeeea..dc6edce8d9 100644 --- a/tests/unittest/_torch/modules/conftest.py +++ b/tests/unittest/_torch/modules/conftest.py @@ -35,6 +35,7 @@ TESTS_WITH_CONFIGURABLE_MOE = [ "test_fused_moe_mxfp4_mxfp8", "test_fused_moe_w4a8_nvfp4_fp8", "test_fused_moe_wfp4a16", + "test_fused_moe_fp8_blockwise_deepgemm", ] diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index fa58161896..c66b136910 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -852,12 +852,23 @@ def test_fused_moe_fp8_blockwise_wide_ep(alltoall_method_type): [DefaultMoeRoutingMethod], ), ) +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_fused_moe_fp8_blockwise_deepgemm(dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, + enable_configurable_moe, + mocker, mapping=None): + + mocker.patch.dict(os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" if enable_configurable_moe == 1 else "0" + }) + SEQ_LEN = seq_len HIDDEN_SIZE = hidden_size INTERMEDIATE_SIZE = 256 @@ -921,14 +932,20 @@ def test_fused_moe_fp8_blockwise_deepgemm(dtype, quant_config = QuantConfig(quant_algo=QuantAlgo.FP8_BLOCK_SCALES) - fused_moe = DeepGemmFusedMoE( - num_experts=NUM_EXPERTS, + # Create pretrained_config with necessary parameters + pretrained_config = PretrainedConfig() + pretrained_config.num_experts = NUM_EXPERTS + pretrained_config.hidden_size = HIDDEN_SIZE + pretrained_config.intermediate_size = INTERMEDIATE_SIZE + pretrained_config.torch_dtype = dtype + + fused_moe = create_moe( routing_method=routing_method, - hidden_size=HIDDEN_SIZE, - 
intermediate_size=INTERMEDIATE_SIZE, - dtype=dtype, reduce_results=True, - model_config=ModelConfig(quant_config=quant_config, mapping=mapping), + model_config=ModelConfig(pretrained_config=pretrained_config, + quant_config=quant_config, + mapping=mapping, + moe_backend="DEEPGEMM"), ) fused_moe.cuda() fused_moe.load_weights([weights]) From b57650f1e6fb481dbd9d78656a373c22d872a129 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:21:54 +0800 Subject: [PATCH 133/172] [TRTLLM-9794][ci] move test cases of gpt-oss to gb200 (#9934) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_dgx_b200.yml | 11 ----------- .../test_lists/test-db/l0_gb200_multi_gpus.yml | 9 +++++++++ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index f54045dd16..a9dd9b993e 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -26,18 +26,7 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4] - condition: ranges: diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index b53a64c61b..40fe6ed675 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -53,6 +53,15 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] + - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] From 0788635d6cf6b7545c3d0b1190369244a13702f7 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:30:28 +0800 Subject: [PATCH 134/172] [TRTLLM-9762] [doc] Update documents for GB300 NVL72 (#9987) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- docs/source/legacy/reference/support-matrix.md | 1 + docs/source/overview.md | 6 +++--- examples/disaggregated/slurm/benchmark/README.md | 2 +- examples/disaggregated/slurm/benchmark/config.yaml | 2 +- examples/disaggregated/slurm/benchmark/start_worker.sh | 4 ++-- examples/wide_ep/README.md | 10 +++++----- examples/wide_ep/slurm_scripts/README.md | 2 +- examples/wide_ep/slurm_scripts/config.yaml | 2 +- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/source/legacy/reference/support-matrix.md b/docs/source/legacy/reference/support-matrix.md index 1dc59fcfa0..24a3a01512 100644 --- a/docs/source/legacy/reference/support-matrix.md +++ b/docs/source/legacy/reference/support-matrix.md @@ -133,6 +133,7 @@ In addition, older architectures can have limitations for newer software release * - GPU Model Architectures - - [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/) + - [NVIDIA GB300 NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72/) - [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/) - [NVIDIA Grace Hopper Superchip](https://www.nvidia.com/en-us/data-center/grace-hopper-superchip/) - [NVIDIA Hopper 
Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/) diff --git a/docs/source/overview.md b/docs/source/overview.md index 0df4f72539..471e57ff23 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -4,7 +4,7 @@ ## About TensorRT LLM -[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs. +[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs. ## Key Capabilities @@ -40,7 +40,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**. ### 🚀 **Advanced Optimization & Production Features** - **[In-Flight Batching & Paged Attention](./features/paged-attention-ifb-scheduler.md)**: In-flight batching eliminates wait times by dynamically managing request execution, processing context and generation phases together for maximum GPU utilization and reduced latency. - **[Multi-GPU Multi-Node Inference](./features/parallel-strategy.md)**: Seamless distributed inference with tensor, pipeline, and expert parallelism across multiple GPUs and nodes through the Model Definition API. -- **[Advanced Quantization](./features/quantization.md)**: +- **[Advanced Quantization](./features/quantization.md)**: - **FP4 Quantization**: Native support on NVIDIA B200 GPUs with optimized FP4 kernels - **FP8 Quantization**: Automatic conversion on NVIDIA H100 GPUs leveraging Hopper architecture - **[Speculative Decoding](./features/speculative-decoding.md)**: Multiple algorithms including EAGLE, MTP and NGram @@ -54,7 +54,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**. 
### 🔧 **Latest GPU Architecture Support** TensorRT LLM supports the full spectrum of NVIDIA GPU architectures: -- **NVIDIA Blackwell**: B200, GB200, RTX Pro 6000 SE with FP4 optimization +- **NVIDIA Blackwell**: B200, GB200, B300, GB300, and RTX Pro 6000 SE with FP4 optimization - **NVIDIA Hopper**: H100, H200,GH200 with FP8 acceleration - **NVIDIA Ada Lovelace**: L40/L40S, RTX 40 series with FP8 acceleration - **NVIDIA Ampere**: A100, RTX 30 series for production workloads diff --git a/examples/disaggregated/slurm/benchmark/README.md b/examples/disaggregated/slurm/benchmark/README.md index 29b7301f08..5feb896aee 100644 --- a/examples/disaggregated/slurm/benchmark/README.md +++ b/examples/disaggregated/slurm/benchmark/README.md @@ -31,7 +31,7 @@ slurm: job_name: "" extra_args: "" # Additional SLURM arguments (e.g., "--gres=gpu:4 --exclude=node1") set_segment: true # Optional: whether to set the segment for the job - numa_bind: true # Enable NUMA binding for GB200 NVL72 + numa_bind: true # Enable NUMA binding for GB200/GB300 NVL72 ``` ### 2. Benchmark Configuration diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index afe7282348..b0952d9b7c 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -7,7 +7,7 @@ slurm: job_name: "" extra_args: "" # Cluster specific arguments, e.g. 
"--gres=gpu:4 --exclude=node1,node2" set_segment: true # Optional: whether to set the segment for the job - numa_bind: true # Only enable for GB200 NVL72 + numa_bind: true # Only enable for GB200/GB300 NVL72 # Benchmark Mode benchmark: diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh index f51fccd6f0..e2ac1f7530 100644 --- a/examples/disaggregated/slurm/benchmark/start_worker.sh +++ b/examples/disaggregated/slurm/benchmark/start_worker.sh @@ -27,10 +27,10 @@ done if [ "${numa_bind}" = "true" ]; then numa_bind_cmd="numactl -m 0,1" - echo "numactl -m 0,1 - Only allocate memory from nodes on GB200" + echo "numactl -m 0,1 - Only allocate memory from nodes on GB200/GB300 NVL72" else numa_bind_cmd="" - echo "Not binding memory. If on GB200, use \"numactl -m 0,1\" to only allocate memory from nodes." + echo "Not binding memory. If on GB200/GB300 NVL72, use \"numactl -m 0,1\" to only allocate memory from nodes." fi if [ "${benchmark_mode}" = "gen_only" ]; then diff --git a/examples/wide_ep/README.md b/examples/wide_ep/README.md index a9b52cbe8a..cce3993b32 100644 --- a/examples/wide_ep/README.md +++ b/examples/wide_ep/README.md @@ -21,13 +21,13 @@ Wide-EP solves these challenges through: ### Prerequisites -* GPU: GB200 NVL72, H20, or RTX 6000D. +* GPU: GB200 NVL72, GB300 NVL72, H20, or RTX 6000D. * OS: Linux * Drivers: CUDA Driver 575 or Later * Docker with NVIDIA Container Toolkit installed * Python3 and python3-pip (Optional, for accuracy evaluation only) -For GB200 NVL72, to make sure that Multi-Node NVLink (MNNVL) is correctly setup, check if the path `/dev/nvidia-caps-imex-channels` exists in the container. If the path doesn't exist, mount it when launching the Docker container. +For GB200/GB300 NVL72, to make sure that Multi-Node NVLink (MNNVL) is correctly setup, check if the path `/dev/nvidia-caps-imex-channels` exists in the container. 
If the path doesn't exist, mount it when launching the Docker container. For more information on NVIDIA IMEX service for NVLink networks, refer to https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/overview.html. @@ -108,16 +108,16 @@ If `never` is highlighted, enable Transparent HugePages by the following command echo madvise > /sys/kernel/mm/transparent_hugepage/enabled ``` -### GB200 NUMA binding +### GB200/GB300 NVL72 NUMA binding -GPU memory is also on NUMA nodes on GB200 and the system can also use that. Bind memory to CPU nodes to avoid GPU memory being used as host memory. +GPU memory is also on NUMA nodes on GB200/GB300 NVL72 and the system can also use that. Bind memory to CPU nodes to avoid GPU memory being used as host memory. ```bash numactl -m 0,1 ``` ### Shared Memory on EPLB -To achieve online load balancing, all expert weights are stored in shared host memory. Four ranks on the same GB200 node share the same expert weights to save memory. +To achieve online load balancing, all expert weights are stored in shared host memory. Four ranks on the same GB200/GB300 NVL72 node share the same expert weights to save memory. There is one environment variable `TRTLLM_EPLB_SHM_NAME` to specify the base name of the shared memory. This environment variable may need to be specified if there are multiple instances on one node. If not, you can ignore it. diff --git a/examples/wide_ep/slurm_scripts/README.md b/examples/wide_ep/slurm_scripts/README.md index a3865035fe..625dfc78e8 100644 --- a/examples/wide_ep/slurm_scripts/README.md +++ b/examples/wide_ep/slurm_scripts/README.md @@ -51,7 +51,7 @@ Before running benchmarks, ensure you have: 1. **SLURM Cluster Access**: Valid account and partition allocation 2. 
**Container Environment**: - NVIDIA Container Toolkit configured - - Required device mappings (e.g., `/dev/nvidia-caps-imex-channels` for GB200, `/dev/gdrdrv` for GDRCopy) + - Required device mappings (e.g., `/dev/nvidia-caps-imex-channels` for GB200/GB300 NVL72, `/dev/gdrdrv` for GDRCopy) 3. **Model Files**: Checkpoint files accessible from all cluster nodes 4. **Configuration**: Updated `config.yaml` with your cluster-specific settings diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml index c019c0d29d..2f10c9707d 100644 --- a/examples/wide_ep/slurm_scripts/config.yaml +++ b/examples/wide_ep/slurm_scripts/config.yaml @@ -6,7 +6,7 @@ slurm: job_time: "02:00:00" job_name: "" extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2" - numa_bind: true # Only enable for GB200 NVL72 + numa_bind: true # Only enable for GB200/GB300 NVL72 # Benchmark Mode benchmark: From 8f144d9282d08a8507df641bed40844fb20a85a0 Mon Sep 17 00:00:00 2001 From: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:42:25 +0800 Subject: [PATCH 135/172] [TRTLLM-9416][feat] Skip DS-v3.2 indexer MQA and Top-K for short sequences. 
(#9524) Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com> --- .../_torch/attention_backend/sparse/dsa.py | 135 +++++- tensorrt_llm/_torch/model_config.py | 6 +- .../_torch/pyexecutor/cuda_graph_runner.py | 87 +++- .../_torch/pyexecutor/model_engine.py | 63 ++- tensorrt_llm/_torch/speculative/utils.py | 2 +- tensorrt_llm/llmapi/llm_args.py | 27 ++ .../defs/accuracy/test_llm_api_pytorch.py | 53 ++- .../test_lists/qa/llm_function_core.txt | 2 + .../qa/llm_function_core_sanity.txt | 2 + .../test_lists/test-db/l0_dgx_b200.yml | 2 + .../test_lists/test-db/l0_dgx_h200.yml | 1 + .../attention/sparse/test_dsa_indexer.py | 419 +++++++++++++----- 12 files changed, 614 insertions(+), 185 deletions(-) diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py index a46752745a..904a0fb20d 100644 --- a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py +++ b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py @@ -306,6 +306,12 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): sparse_mla_topk: int # max number of draft tokens max_draft_tokens: int = 0 + # Enable indexer skip for short sequences + enable_indexer_skip: bool = False + # Whether skip the indexer for context requests + skip_indexer_for_ctx_reqs: bool = False + # Whether skip the indexer for generation requests + skip_indexer_for_gen_reqs: bool = False def __init__(self, *args, **kwargs): self.num_sms = tensorrt_llm.deep_gemm.get_num_sms() @@ -314,11 +320,12 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): self.indexer_max_chunk_size = self.sparse_attention_config.indexer_max_chunk_size else: self.indexer_max_chunk_size = 32768 # Default to 32K tokens for the indexer - self.sparse_mla_topk = self.sparse_attention_config.index_topk def __post_init__(self): super().__post_init__() + self.sparse_mla_topk = self.sparse_attention_config.index_topk + self.enable_indexer_skip = 
self.sparse_attention_config.skip_indexer_for_short_seqs capture_graph = torch.cuda.is_current_stream_capturing() self.indexer_k_cache_block_offsets = self.get_empty( @@ -454,6 +461,21 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): dtype=torch.int32, capture_graph=capture_graph, ) + # Topk indices buffer to support skip indexer for requests with short sequence lengths + if self.enable_indexer_skip: + self.topk_indices_buffer = self.get_empty( + self.cuda_graph_buffers, + (self.max_num_tokens, self.sparse_mla_topk), + cache_name="topk_indices_buffer", + dtype=torch.int32, + capture_graph=capture_graph, + ) + self.host_topk_indices_buffer = torch.zeros_like( + self.topk_indices_buffer, + device='cpu', + pin_memory=True, + ) + # Create expanded buffers for MTP>1 support self.create_expanded_buffers(capture_graph=capture_graph) # TODO: remove these expanded buffers when fp8_paged_mqa_logits supports MTP > 1. @@ -520,8 +542,98 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): capture_graph = torch.cuda.is_current_stream_capturing() self.create_expanded_buffers(capture_graph=capture_graph) + def prepare_dense_topk_indices(self, + kv_lens, + device=False): # device=False means use CPU + + @maybe_compile(dynamic=True) + def _get_dense_topk_indices(seq_lens, kv_lens, num_tokens): + device = kv_lens.device + past_kv_lens = kv_lens - seq_lens + # get position ids + seq_ends = torch.cumsum(seq_lens, dim=0) + seq_starts = seq_ends - seq_lens + per_seq_offsets = past_kv_lens - seq_starts # Shape: [batch_size] + global_indices = torch.arange(num_tokens, device=device) + batch_indices = torch.searchsorted(seq_ends, + global_indices, + side='right') + repeated_offsets = per_seq_offsets[batch_indices] + position_ids = global_indices + repeated_offsets + # get the dense topk indices with causal mask + range_row = torch.arange(self.sparse_mla_topk, device=device) + mask = range_row <= position_ids.unsqueeze(1) + return torch.where(mask, range_row, -1) + + if 
self.num_contexts > 0 and self.skip_indexer_for_ctx_reqs: + ctx_range = slice(self.num_ctx_tokens) + if device: + self.topk_indices_buffer[ctx_range, :].copy_( + _get_dense_topk_indices( + self.seq_lens_cuda[:self.num_contexts], + kv_lens[:self.num_contexts], self.num_ctx_tokens), + non_blocking=True) + else: + self.host_topk_indices_buffer[ + ctx_range, :] = _get_dense_topk_indices( + self.seq_lens[:self.num_contexts], + kv_lens[:self.num_contexts], self.num_ctx_tokens) + self.topk_indices_buffer[ctx_range, :].copy_( + self.host_topk_indices_buffer[ctx_range, :], + non_blocking=True) + + if self.num_generations > 0 and self.skip_indexer_for_gen_reqs: + gen_range = slice(self.num_ctx_tokens, self.num_tokens) + if device: + self.topk_indices_buffer[gen_range, :].copy_( + _get_dense_topk_indices( + self.seq_lens_cuda[self.num_contexts:self.num_seqs], + kv_lens[self.num_contexts:self.num_seqs], + self.num_tokens - self.num_ctx_tokens), + non_blocking=True) + else: + self.host_topk_indices_buffer[ + gen_range, :] = _get_dense_topk_indices( + self.seq_lens[self.num_contexts:self.num_seqs], + kv_lens[self.num_contexts:self.num_seqs], + self.num_tokens - self.num_ctx_tokens) + self.topk_indices_buffer[gen_range, :].copy_( + self.host_topk_indices_buffer[gen_range, :], + non_blocking=True) + def prepare(self): super().prepare() + + # Get kv lengths + assert self.kv_cache_params.use_cache is True, "DSA requires use_cache to be True" + cached_token_lens = torch.tensor( + self.kv_cache_params.num_cached_tokens_per_seq, + dtype=torch.int, + device='cpu', + ) + kv_lens = cached_token_lens + self.seq_lens_kv + + # Prepare to support skip indexer + num_extra_kv_tokens = self.kv_cache_params.num_extra_kv_tokens + if self.num_contexts > 0 and self.enable_indexer_skip: + # Minus the number of extra KV tokens because when using one-model MTP, the + # draft layers needs more KV tokens for the next draft forwards. 
+ self.skip_indexer_for_ctx_reqs = kv_lens[:self.num_contexts].max( + ).item() <= self.sparse_mla_topk - num_extra_kv_tokens + else: + self.skip_indexer_for_ctx_reqs = False + + if self.num_generations > 0 and self.enable_indexer_skip: + # Minus the number of extra KV tokens because when using one-model MTP, the + # draft layers needs more KV tokens for the next draft forwards. + self.skip_indexer_for_gen_reqs = kv_lens[ + self.num_contexts:self.num_seqs].max().item( + ) <= self.sparse_mla_topk - num_extra_kv_tokens + else: + self.skip_indexer_for_gen_reqs = False + self.prepare_dense_topk_indices(kv_lens) + + # Build indexer_k_cache_block_offsets if self.kv_cache_manager is not None: block_ids = self.kv_cache_manager.get_batch_cache_indices( self.request_ids) @@ -560,14 +672,6 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): host_block_table, non_blocking=True) # For mla_rope_append_paged_kv_assign_q - assert self.kv_cache_params.use_cache is True, "DSA requires use_cache to be True" - cached_token_lens = torch.tensor( - self.kv_cache_params.num_cached_tokens_per_seq, - dtype=torch.int, - device='cpu', - ) - kv_lens = cached_token_lens + self.seq_lens_kv - if self.num_contexts > 0: self.num_ctx_cached_tokens = cached_token_lens[:self. 
num_contexts].sum( @@ -682,6 +786,7 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): tokens_per_block, self.num_sms) self.scheduler_metadata_buffer.copy_(scheduler_metadata_buffer, non_blocking=True) + self.prepare_dense_topk_indices(self.kv_lens_cuda, device=True) def update_for_spec_dec(self): super().update_for_spec_dec() @@ -1206,7 +1311,7 @@ class Indexer(nn.Module): if not use_custom_topk: topk_indices_buffer[:hidden_states.shape[0]] = -1 - if has_prefill: + if has_prefill and not metadata.skip_indexer_for_ctx_reqs: # Use chunked prefill to reduce memory footprint if metadata.indexer_prefill_chunks is not None: for chunk in metadata.indexer_prefill_chunks: @@ -1275,8 +1380,12 @@ class Indexer(nn.Module): topk_indices_buffer[:num_ctx_tokens, :topk_indices. shape[-1]] = topk_indices.to( dtype=torch.int32) + elif has_prefill and metadata.skip_indexer_for_ctx_reqs: + # Fill topk_indices_buffer with pre-defined dense topk indices + topk_indices_buffer[:num_ctx_tokens, :] = \ + metadata.topk_indices_buffer[:num_ctx_tokens, :] - if has_decode: + if has_decode and not metadata.skip_indexer_for_gen_reqs: max_seq_len = metadata.kv_cache_manager.max_seq_len # Get decode lengths per request (from seq_lens) for validation gen_seq_lens = metadata.seq_lens[num_contexts:num_contexts + @@ -1361,6 +1470,10 @@ class Indexer(nn.Module): num_gen_tokens, :topk_indices_decode. 
shape[-1]] = topk_indices_decode.to( dtype=torch.int32) + elif has_decode and metadata.skip_indexer_for_gen_reqs: + # Fill topk_indices_buffer with pre-defined dense topk indices + topk_indices_buffer[num_ctx_tokens:num_tokens, :] = \ + metadata.topk_indices_buffer[num_ctx_tokens:num_tokens, :] return topk_indices_buffer def _weight_scale(self, weights: torch.Tensor, diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 148ec5e2e3..ed61109dc8 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -421,17 +421,21 @@ class ModelConfig(Generic[TConfig]): index_head_dim = sparse_attention_config.index_head_dim or pretrained_config.index_head_dim index_topk = sparse_attention_config.index_topk or pretrained_config.index_topk indexer_max_chunk_size = sparse_attention_config.indexer_max_chunk_size + skip_indexer_for_short_seqs = sparse_attention_config.skip_indexer_for_short_seqs else: index_n_heads = pretrained_config.index_n_heads index_head_dim = pretrained_config.index_head_dim index_topk = pretrained_config.index_topk indexer_max_chunk_size = None + skip_indexer_for_short_seqs = False kwargs[ 'sparse_attention_config'] = DeepSeekSparseAttentionConfig( index_n_heads=index_n_heads, index_head_dim=index_head_dim, index_topk=index_topk, - indexer_max_chunk_size=indexer_max_chunk_size) + indexer_max_chunk_size=indexer_max_chunk_size, + skip_indexer_for_short_seqs= + skip_indexer_for_short_seqs) else: raise ValueError( "checkpoint_dir is None. Cannot load model config without a valid checkpoint directory." 
diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py index b8e2754a9c..14bd727d9c 100644 --- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py +++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py @@ -1,11 +1,12 @@ import bisect import contextlib from dataclasses import dataclass -from typing import Any, Callable, Dict, Optional, Tuple +from typing import Any, Callable, Dict, Optional, Tuple, TypeAlias import torch -from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig +from tensorrt_llm.llmapi.llm_args import (BaseSparseAttentionConfig, + DecodingBaseConfig) from tensorrt_llm.mapping import Mapping from ...inputs.multimodal import MultimodalParams @@ -14,13 +15,17 @@ from ..expert_statistic import ExpertStatistic from ..memory_buffer_utils import get_memory_buffers from ..modules.multi_stream_utils import with_multi_stream from ..speculative.eagle3 import Eagle3ResourceManager +from ..speculative.mtp import SampleStateTensorsMTP from ..utils import make_weak_ref, piecewise_cuda_graph +from .llm_request import get_draft_token_length from .resource_manager import (BaseResourceManager, ResourceManager, ResourceManagerType) +from .sampler import SampleStateTensors from .scheduler import ScheduledRequests # A large prime number used for dummy request IDs to avoid collisions CUDA_GRAPH_DUMMY_REQUEST_ID = (1 << 64) - 1 +KeyType: TypeAlias = Tuple[int, int, bool, bool] @dataclass @@ -71,6 +76,7 @@ class CUDAGraphRunnerConfig: mapping: Optional[Mapping] dist: Optional[MPIDist] kv_cache_manager_key: Any + sparse_attention_config: Optional[BaseSparseAttentionConfig] = None class CUDAGraphRunner: @@ -93,11 +99,12 @@ class CUDAGraphRunner: self.max_supported_batch_size = config.max_cuda_graph_batch_size self.max_beam_width = config.max_beam_width self.spec_config = config.spec_config + self.sparse_config = config.sparse_attention_config - self.graphs: Dict[Tuple[int, int, int], 
torch.cuda.CUDAGraph] = {} - self.graph_outputs: Dict[Tuple[int, int, int], + self.graphs: Dict[KeyType, torch.cuda.CUDAGraph] = {} + self.graph_outputs: Dict[KeyType, Callable[[], Optional[torch.Tensor]]] = {} - self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {} + self.graph_metadata: Dict[KeyType, Dict[str, Any]] = {} self.memory_pool = config.cuda_graph_mem_pool self.padding_dummy_request: Optional["Request"] = None @@ -135,17 +142,70 @@ class CUDAGraphRunner: }) for _ in range(max_total_tokens) ] + def _get_seq_len_mode( + self, + batch: ScheduledRequests, + new_tensors_device: Optional[SampleStateTensors] = None): + if self.sparse_config is not None and self.sparse_config.needs_separate_short_long_cuda_graphs( + ): + # Some sparse attention algorithms need to use different forward paths for short and long sequences. + # For example, the DSA can skip the MQA and Top-K in the indexer for short sequences to reduce the + # computational overhead. To support this feature, we need to capture separate CUDA graphs for short + # and long sequences. We need to first collect the sequence length of the requests and then determine + # the sequence length mode. For long sequences, use the default maximum sequence length. For short + # sequences, use the sequence length threshold as the maximum sequence length. 
+ total_seq_lens = [] + new_tokens_device, next_draft_tokens_device = None, None + if new_tensors_device is not None: + new_tokens_device = new_tensors_device.new_tokens + if isinstance(new_tensors_device, SampleStateTensorsMTP): + next_draft_tokens_device = new_tensors_device.next_draft_tokens + overlap_scheduler_enabled = new_tokens_device is not None + for request in batch.generation_requests: + is_spec_request = get_draft_token_length( + request) > 0 or next_draft_tokens_device is not None + num_draft_tokens = self.spec_config.max_draft_len if is_spec_request else 0 + # First draft + if request.py_is_first_draft: + total_seq_len = len(request.get_tokens(0)) + # With overlap scheduler disabled or dummy request or not assigned to a batch, + elif not overlap_scheduler_enabled or request.is_dummy or request.py_batch_idx is None: + total_seq_len = request.max_beam_num_tokens + num_draft_tokens + # Other cases + else: + total_seq_len = request.max_beam_num_tokens + num_draft_tokens + 1 + total_seq_lens.append(total_seq_len) + # Determine the sequence length mode. + from ..speculative import get_num_extra_kv_tokens + num_extra_kv_tokens = get_num_extra_kv_tokens(self.spec_config) + max_seq_len = max(total_seq_lens) + if max_seq_len <= self.sparse_config.seq_len_threshold - num_extra_kv_tokens: + short_seq_len_mode = True + else: + short_seq_len_mode = False + else: + # For non-sparse attention or sparse attention that does not need separate short and long CUDA graphs, + # use the default sequence length mode. + short_seq_len_mode = False + return short_seq_len_mode + def get_graph_key( self, batch: ScheduledRequests, + new_tensors_device: Optional[SampleStateTensors] = None, spec_resource_manager: Optional[BaseResourceManager] = None): batch_size = batch.batch_size + + # Get the sequence length mode. 
+ short_seq_len_mode = self._get_seq_len_mode(batch, new_tensors_device) + if self.config.is_draft_model and spec_resource_manager is not None and isinstance( spec_resource_manager, Eagle3ResourceManager): # If 'is_first_draft' is True, even with tree decoding, the length of draft_len will only be 'max_draft_len', not 'max_total_draft_token'. # Because we will pad the input to 'max_draft_len' length for the first draft layer. draft_len = self.config.original_max_draft_len if spec_resource_manager.is_first_draft else 0 - key = (batch_size, draft_len, spec_resource_manager.is_first_draft) + key = (batch_size, draft_len, spec_resource_manager.is_first_draft, + short_seq_len_mode) else: # With dynamic spec decode, the draft length maybe zero even when enable_spec_decode is True, # so we need to get the draft length from the batch instead of using enable_spec_decode. @@ -155,7 +215,7 @@ class CUDAGraphRunner: draft_len = max(draft_len_list) assert len( set(draft_len_list)) == 1, "All draft lengths must be the same" - key = (batch_size, draft_len, False) + key = (batch_size, draft_len, False, short_seq_len_mode) return key def __del__(self): @@ -168,6 +228,7 @@ class CUDAGraphRunner: attn_metadata: Any, spec_metadata: Optional[Any] = None, draft_tokens_cuda: Optional[torch.Tensor] = None, + new_tensors_device: Optional[SampleStateTensors] = None, spec_resource_manager: Optional[BaseResourceManager] = None, ) -> Tuple[Optional[Any], Optional[Any], Optional[Tuple[int, int, bool]]]: """ @@ -198,7 +259,8 @@ class CUDAGraphRunner: if not self.enabled or not can_run_cuda_graph: return None, None, None - key = self.get_graph_key(batch, spec_resource_manager) + key = self.get_graph_key(batch, new_tensors_device, + spec_resource_manager) if key in self.graphs: return self.graph_metadata[key][ @@ -220,7 +282,7 @@ class CUDAGraphRunner: graph_spec_metadata = None return graph_attn_metadata, graph_spec_metadata, key - def needs_capture(self, key: Tuple[int, int, int]): + def 
needs_capture(self, key: KeyType): return key not in self.graph_outputs def get_graph_pool(self): @@ -233,7 +295,7 @@ class CUDAGraphRunner: return self.memory_pool def capture(self, - key: Tuple[int, int, int], + key: KeyType, forward_fn: Callable, initial_inputs: Dict[str, Any], enable_spec_decode: bool = False, @@ -270,8 +332,7 @@ class CUDAGraphRunner: "spec_metadata": initial_inputs.get("spec_metadata", None), } - def _setup_spec_decoding_and_forward(key: Tuple[int, int, int], - forward_fn: Callable, + def _setup_spec_decoding_and_forward(key: KeyType, forward_fn: Callable, capture_inputs: Dict[str, Any]): is_first_draft = key[2] needs_kv_cache_recompute = True if enable_spec_decode and self.config.spec_config.spec_dec_mode.needs_kv_cache_recompute( @@ -302,7 +363,7 @@ class CUDAGraphRunner: self.graph_outputs[key] = make_weak_ref(output) self.memory_pool = graph.pool() - def replay(self, key: Tuple[int, int, int], + def replay(self, key: KeyType, current_inputs: Dict[str, Any]) -> Optional[torch.Tensor]: """Replays a previously captured graph.""" stored_meta = self.graph_metadata[key] diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 5da64a5569..10054bee8c 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -427,6 +427,7 @@ class PyTorchModelEngine(ModelEngine): mapping=self.mapping, dist=self.dist, kv_cache_manager_key=self.kv_cache_manager_key, + sparse_attention_config=self.sparse_attention_config, ) self.cuda_graph_runner = CUDAGraphRunner(cuda_graph_runner_config) @@ -704,31 +705,48 @@ class PyTorchModelEngine(ModelEngine): draft_lengths.append(0) draft_lengths = [self.max_total_draft_tokens] + # Create CUDA graphs for short and long sequences separately for sparse attention. 
+ sparse_config = self.sparse_attention_config + if sparse_config is not None and sparse_config.needs_separate_short_long_cuda_graphs( + ): + # For short sequences, use the (seq_len_threshold - max_draft_len - 1) as the maximum sequence length + # to make sure all of the past and current input tokens are within the sequence length threshold. + # For long sequences, use the default maximum sequence length (self.max_seq_len). + max_seq_len = sparse_config.seq_len_threshold - ( + self.max_draft_len + 1) + if max_seq_len < self.max_seq_len: + max_seq_len_list = [self.max_seq_len, max_seq_len] + else: + max_seq_len_list = [self.max_seq_len] + else: + max_seq_len_list = [self.max_seq_len] + for bs in cuda_graph_batch_sizes: if bs > self.batch_size: continue for draft_len in draft_lengths: - warmup_request = self._create_cuda_graph_warmup_request( - resource_manager, bs, draft_len) - with self._release_batch_context(warmup_request, - resource_manager) as batch: - if batch is None: - # No KV cache space, cannot continue capturing graphs - return + for max_seq_len in max_seq_len_list: + warmup_request = self._create_cuda_graph_warmup_request( + resource_manager, bs, draft_len, max_seq_len) + with self._release_batch_context(warmup_request, + resource_manager) as batch: + if batch is None: + # No KV cache space, cannot continue capturing graphs + return - logger.info( - f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}" - ) + logger.info( + f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}, max_seq_len={max_seq_len}" + ) - self.enable_spec_decode = draft_len > 0 or self.is_draft_model - self._update_draft_inference_state_for_warmup( - batch, draft_len > 0, resource_manager) + self.enable_spec_decode = draft_len > 0 or self.is_draft_model + self._update_draft_inference_state_for_warmup( + batch, draft_len > 0, resource_manager) - self.forward(batch, - new_tensors_device=None, - 
resource_manager=resource_manager) - torch.cuda.synchronize() + self.forward(batch, + new_tensors_device=None, + resource_manager=resource_manager) + torch.cuda.synchronize() def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager): """Captures piecewise CUDA graphs for context/prefill steps via torch.compile.""" @@ -873,8 +891,11 @@ class PyTorchModelEngine(ModelEngine): return result def _create_cuda_graph_warmup_request( - self, resource_manager: ResourceManager, batch_size: int, - draft_len: int) -> Optional[ScheduledRequests]: + self, + resource_manager: ResourceManager, + batch_size: int, + draft_len: int, + max_seq_len: int = None) -> Optional[ScheduledRequests]: """Creates a dummy ScheduledRequests tailored for CUDA graph capture.""" kv_cache_manager = resource_manager.get_resource_manager( self.kv_cache_manager_key) @@ -902,7 +923,8 @@ class PyTorchModelEngine(ModelEngine): available_tokens = kv_cache_manager.get_num_available_tokens(draft_len) # Add one dummy request with the maximum possible sequence length. 
- token_num = max(1, min(available_tokens, self.max_seq_len - 1)) + max_seq_len = self.max_seq_len if max_seq_len is None else max_seq_len + token_num = max(1, min(available_tokens, max_seq_len - 1)) model_config = self.model.model_config.pretrained_config max_position_embeddings = getattr(model_config, 'max_position_embeddings', None) @@ -2693,6 +2715,7 @@ class PyTorchModelEngine(ModelEngine): spec_metadata=spec_metadata, draft_tokens_cuda=self.draft_tokens_cuda if self.is_spec_decode else None, + new_tensors_device=new_tensors_device, spec_resource_manager=spec_resource_manager, ) can_run_graph = key is not None diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index 139787df44..6a22ad19bd 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -237,7 +237,7 @@ def get_num_extra_kv_tokens(spec_config): """ if spec_config is None: return 0 - if spec_config.spec_dec_mode.is_eagle3_one_model(): + if spec_config.spec_dec_mode.use_one_engine(): return spec_config.max_draft_len - 1 return 0 diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index b790dc141d..c2d5f23f50 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -188,6 +188,11 @@ class BaseSparseAttentionConfig(StrictBaseModel): """ Configuration for sparse attention. """ + seq_len_threshold: Optional[int] = Field( + default=None, + description= + "The sequence length threshold for separating short and long sequences." + ) @classmethod def from_dict(cls, data: dict): @@ -223,6 +228,15 @@ class BaseSparseAttentionConfig(StrictBaseModel): def get_indices_block_size(self) -> int: return 1 + def needs_separate_short_long_cuda_graphs(self) -> bool: + """ + Determines whether to capture a dedicated CUDA graph for batches consisting entirely of short sequences. 
+ If True, capture distinct graphs for short-only batches and general cases (e.g., long or mixed batches). + If False, capture a single unified CUDA graph for all sequences regardless of length. + The seq_len_threshold parameter defines the cutoff boundary between short and long sequences. + """ + return False + class RocketSparseAttentionConfig(BaseSparseAttentionConfig): """ @@ -268,6 +282,11 @@ class DeepSeekSparseAttentionConfig(BaseSparseAttentionConfig): description="The topk for the indexer.") indexer_max_chunk_size: Optional[int] = Field( default=None, description="The maximum chunk size for the indexer.") + # TODO: enable this by default once the memory usage in attention metadata is optimized + skip_indexer_for_short_seqs: bool = Field( + default=False, + description= + "Whether to skip the MQA and Top-K in the indexer for short sequences.") @classmethod def from_dict(cls, data: dict): @@ -276,6 +295,14 @@ class DeepSeekSparseAttentionConfig(BaseSparseAttentionConfig): def supports_backend(self, backend: str) -> bool: return backend == "pytorch" + def needs_separate_short_long_cuda_graphs(self) -> bool: + """ + Whether to capture separate CUDA graphs for short and long sequences. + Use seq_len_threshold to determine the threshold for separating short and long sequences. 
+ """ + self.seq_len_threshold = self.index_topk + return self.skip_indexer_for_short_seqs + class MoeLoadBalancerConfig(StrictBaseModel): """ diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index af8dd5073f..483dff0598 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -51,6 +51,7 @@ from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig, + DeepSeekSparseAttentionConfig, EagleDecodingConfig, KvCacheConfig, MoeConfig, MTPDecodingConfig, NGramDecodingConfig, RocketSparseAttentionConfig, SamplingParams, @@ -2651,21 +2652,22 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): @skip_pre_hopper @pytest.mark.skip_less_device_memory(140000) @pytest.mark.parametrize( - "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", + "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend,skip_indexer", [ - (8, 1, 8, 0, False, True, True, True, 24, "_DEFAULT"), - (8, 1, 8, 1, False, True, True, True, 24, "_DEFAULT"), - (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"), - (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM"), - (8, 1, 8, 3, False, False, True, True, 1, "_DEFAULT"), + (8, 1, 8, 0, False, True, True, True, 24, "_DEFAULT", False), + (8, 1, 8, 1, False, True, True, True, 24, "_DEFAULT", False), + (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT", False), + (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM", False), + (8, 1, 8, 3, False, False, True, True, 1, "_DEFAULT", False), + (8, 1, 8, 1, False, True, True, True, 24, "_DEFAULT", True), ], ids=[ "baseline", "baseline_mtp1", "baseline_fp8kv", "latency", - 
"latency_default" + "latency_default", "skip_indexer" ]) def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, - max_batch_size, moe_backend): + max_batch_size, moe_backend, skip_indexer): if get_sm_version() == 100 or get_sm_version() == 103: moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) @@ -2691,6 +2693,11 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): ) kv_cache_config.dtype = "fp8" + dsa_config = None + if skip_indexer: + dsa_config = DeepSeekSparseAttentionConfig( + skip_indexer_for_short_seqs=True) + mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -2702,7 +2709,8 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): kv_cache_config=kv_cache_config, **pytorch_config, enable_attention_dp=attention_dp, - speculative_config=mtp_config) as llm: + speculative_config=mtp_config, + sparse_attention_config=dsa_config) as llm: # GPQA Diamond takes too long to run, we enable it only for fp8kv. 
if fp8kv: @@ -2721,17 +2729,21 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_device(8) @skip_pre_blackwell @pytest.mark.parametrize( - "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", + "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend,skip_indexer", [ - (8, 1, 8, 0, False, True, True, True, 24, "CUTLASS"), - (8, 1, 8, 1, False, True, True, True, 24, "CUTLASS"), - (8, 1, 8, 0, True, True, True, True, 24, "CUTLASS"), - (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM"), + (8, 1, 8, 0, False, True, True, True, 24, "CUTLASS", False), + (8, 1, 8, 1, False, True, True, True, 24, "CUTLASS", False), + (8, 1, 8, 0, True, True, True, True, 24, "CUTLASS", False), + (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM", False), + (8, 1, 8, 1, False, True, True, True, 24, "CUTLASS", True), ], - ids=["baseline", "baseline_mtp1", "baseline_fp8kv", "latency"]) + ids=[ + "baseline", "baseline_mtp1", "baseline_fp8kv", "latency", + "skip_indexer" + ]) def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, - max_batch_size, moe_backend): + max_batch_size, moe_backend, skip_indexer): if moe_backend == "TRTLLM" and (get_sm_version() == 120 or get_sm_version() == 121): pytest.skip( @@ -2751,6 +2763,12 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): if fp8kv: kv_cache_config.dtype = "fp8" + + dsa_config = None + if skip_indexer: + dsa_config = DeepSeekSparseAttentionConfig( + skip_indexer_for_short_seqs=True) + mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -2762,7 +2780,8 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): kv_cache_config=kv_cache_config, **pytorch_config, enable_attention_dp=attention_dp, - speculative_config=mtp_config) as llm: + speculative_config=mtp_config, + 
sparse_attention_config=dsa_config) as llm: # GPQA Diamond takes too long to run, we enable it only for fp8kv. if fp8kv: diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index eab8fea284..303fa1b0d4 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -500,10 +500,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 228d748e45..91474ba213 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -52,10 +52,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index a9dd9b993e..382dd13553 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -95,8 +95,10 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180) + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT 
(180) + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90) - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index c4d42214ae..55d42b7a3f 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -19,6 +19,7 @@ l0_dgx_h200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] # 1h - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] - accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] - accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] diff --git a/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py b/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py index 8df479b6ff..cccef4acc6 100644 --- a/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py +++ b/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py @@ -18,8 +18,8 @@ from tensorrt_llm import deep_gemm from tensorrt_llm._torch.attention_backend.interface import ( PositionalEmbeddingParams, RopeParams) from tensorrt_llm._torch.attention_backend.sparse.dsa import ( - DSACacheManager, Indexer, compute_cu_seqlen_kv_bounds_with_cache, - split_prefill_chunks) + DSACacheManager, DSAtrtllmAttentionMetadata, Indexer, + 
compute_cu_seqlen_kv_bounds_with_cache, split_prefill_chunks) from tensorrt_llm.bindings import DataType from tensorrt_llm.bindings.executor import KvCacheConfig from tensorrt_llm.bindings.internal.batch_manager import \ @@ -383,7 +383,9 @@ def _create_mock_metadata(request_ids, num_tokens, indexer_max_chunk_size=8194, max_draft_tokens=0, - enable_context_mla_with_cached_kv=False): + enable_context_mla_with_cached_kv=False, + index_topk=2048, + enable_indexer_skip=False): """Helper to create mock metadata for testing.""" class MockKVCacheParams: @@ -391,14 +393,17 @@ def _create_mock_metadata(request_ids, def __init__(self): self.num_cached_tokens_per_seq = num_cached_tokens - class MockMetadata: + class MockMetadata(DSAtrtllmAttentionMetadata): def __init__(self): self.num_sms = deep_gemm.get_num_sms() self.request_ids = request_ids self.num_contexts = num_contexts self.num_generations = num_generations + self._num_seqs = num_contexts + num_generations self.max_draft_tokens = max_draft_tokens + self.sparse_mla_topk = index_topk + self.enable_indexer_skip = enable_indexer_skip # Keep seq_lens on CPU for split_prefill_chunks and other CPU operations # CUDA kernels will convert to CUDA as needed self.seq_lens = seq_lens.cpu() if seq_lens.is_cuda else seq_lens @@ -465,8 +470,9 @@ def _create_mock_metadata(request_ids, device='cpu', pin_memory=True, dtype=torch.int64) - self.num_ctx_tokens = num_ctx_tokens - self.num_tokens = num_tokens + self._num_ctx_tokens = num_ctx_tokens + self._num_tokens = num_tokens + self.num_gen_tokens = num_tokens - num_ctx_tokens # Also set private attributes used by DSAtrtllmAttentionMetadata self._num_contexts = num_contexts self._num_generations = num_generations @@ -509,9 +515,117 @@ def _create_mock_metadata(request_ids, self.runtime_features = RuntimeFeatures() + # Add expanded buffers for MTP>1 support + self.kv_lens_expanded_cuda = torch.zeros( + (self.num_seqs * (1 + self.max_draft_tokens), ), + device='cuda', + dtype=torch.int32) 
+ self.kv_lens_expanded_host = torch.zeros_like( + self.kv_lens_expanded_cuda, device='cpu', pin_memory=True) + self.block_table_expanded = torch.zeros( + (self.num_seqs * (1 + self.max_draft_tokens), + self.kv_cache_manager.max_blocks_per_seq), + device='cuda', + dtype=torch.int32) + self.host_block_table_expanded = torch.zeros_like( + self.block_table_expanded, device='cpu', pin_memory=True) + self.scheduler_metadata_buffer_expanded = torch.zeros( + (self.num_sms + 1, 2), device='cuda', dtype=torch.int32) + if self.max_draft_tokens > 1: + gen_kv_lens = kv_lens[num_contexts:self.num_seqs] + gen_kv_lens_expanded = torch.stack([gen_kv_lens] * + (1 + self.max_draft_tokens), + dim=0) + gen_kv_lens_expanded = gen_kv_lens_expanded.transpose( + 0, 1).contiguous().flatten() + self.kv_lens_expanded_host[:self.num_gen_tokens].copy_( + gen_kv_lens_expanded) + self.kv_lens_expanded_cuda[:self.num_gen_tokens].copy_( + self.kv_lens_expanded_host[:self.num_gen_tokens], + non_blocking=True) + + if self.kv_cache_manager is not None: + block_ids = self.kv_cache_manager.get_batch_cache_indices( + self.request_ids) + gen_block_ids = block_ids[self.num_contexts:] + if len(gen_block_ids) > 0: + # Find max length and create padded tensor + max_len = max(len(bid) for bid in gen_block_ids) + gen_block_tensor = self.host_indexer_k_cache_block_offsets[ + self.num_contexts:self.num_seqs, :max_len] + expanded_blocks = gen_block_tensor.repeat_interleave( + 1 + self.max_draft_tokens, dim=0) + self.host_block_table_expanded[:self.num_gen_tokens, : + max_len].copy_( + expanded_blocks, + non_blocking=True) + self.block_table_expanded[:self.num_gen_tokens].copy_( + self.host_block_table_expanded[:self. 
+ num_gen_tokens], + non_blocking=True) + + # Add skip indexer attributes + self.topk_indices_buffer = torch.zeros( + (num_tokens, self.sparse_mla_topk), + device='cuda', + dtype=torch.int32) + + if self.num_contexts > 0 and self.enable_indexer_skip: + self.skip_indexer_for_ctx_reqs = kv_lens[:self.num_contexts].max( + ).item() <= self.sparse_mla_topk + else: + self.skip_indexer_for_ctx_reqs = False + + if self.num_generations > 0 and self.enable_indexer_skip: + self.max_draft_tokens + 1 + self.skip_indexer_for_gen_reqs = kv_lens[ + self.num_contexts:self.num_seqs].max().item( + ) <= self.sparse_mla_topk + else: + self.skip_indexer_for_gen_reqs = False + self.prepare_dense_topk_indices(self.kv_lens_cuda_runtime, + device=True) + + @property + def num_seqs(self) -> int: + """ + The number of sequences in the batch. + """ + return self._num_seqs + return MockMetadata() +def validate_topk_indices(topk_indices_0, topk_indices_1, total_tokens): + """ + Validate the similarity between two topk indices. 
+ """ + num_exact_matches = 0 + total_similarity = 0.0 + min_similarity = 1.0 + + for token_idx in range(total_tokens): + valid_0 = topk_indices_0[token_idx][topk_indices_0[token_idx] != -1] + valid_1 = topk_indices_1[token_idx][topk_indices_1[token_idx] != -1] + + if torch.equal(valid_0, valid_1): + num_exact_matches += 1 + similarity = 1.0 + total_similarity += similarity + else: + valid_0_set = set(valid_0.cpu().tolist()) + valid_1_set = set(valid_1.cpu().tolist()) + intersection = len(valid_0_set & valid_1_set) + union = len(valid_0_set | valid_1_set) + similarity = intersection / union if union > 0 else 0.0 + total_similarity += similarity + + # Track min similarity + min_similarity = min(min_similarity, similarity) + + return num_exact_matches, total_similarity, min_similarity + + @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available") @skip_pre_hopper def test_indexer_k_cache_scatter_custom_op(): @@ -771,7 +885,7 @@ def test_fp8_k_cache_roundtrip(): @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available") @skip_pre_hopper -@pytest.mark.parametrize("batch_size,next_n", [(4, 1), (2, 2)]) +@pytest.mark.parametrize("batch_size,next_n", [(4, 1), (2, 2), (4, 4)]) def test_indexer_decode_with_paged_kv_cache(batch_size, next_n): """ Test FP8 paged KV cache with two-phase workflow and variable context lengths. 
@@ -899,11 +1013,21 @@ def test_indexer_decode_with_paged_kv_cache(batch_size, next_n): kv_cache_fp8_pool = cache_manager.get_indexer_k_cache_buffers(layer_idx) q_fp8 = q.to(torch.float8_e4m3fn) - logits = fp8_paged_mqa_logits( - q_fp8, kv_cache_fp8_pool, weights, - metadata_gen.kv_lens_cuda_runtime[0:batch_size], - metadata_gen.indexer_k_cache_block_offsets, - metadata_gen.scheduler_metadata_buffer, max_model_len) + if next_n <= 2: + q_fp8 = q_fp8 + context_lens = metadata_gen.kv_lens_cuda_runtime[0:batch_size] + block_table = metadata_gen.indexer_k_cache_block_offsets[0:batch_size] + scheduler_metadata_buffer = metadata_gen.scheduler_metadata_buffer + else: + q_fp8 = q_fp8.view(-1, 1, *q_fp8.shape[2:]) + num_tokens = batch_size * next_n + context_lens = metadata_gen.kv_lens_expanded_cuda[:num_tokens] + block_table = metadata_gen.block_table_expanded[:num_tokens] + scheduler_metadata_buffer = metadata_gen.scheduler_metadata_buffer_expanded + + logits = fp8_paged_mqa_logits(q_fp8, kv_cache_fp8_pool, weights, + context_lens, block_table, + scheduler_metadata_buffer, max_model_len) print(f"✓ Kernel output shape: {logits.shape}") # Reference: Reconstruct BF16 cache from original values @@ -1568,9 +1692,9 @@ def test_indexer_chunked_prefill(chunk_size, seq_lens_list, chunking_type): @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available") @skip_pre_hopper @pytest.mark.parametrize("batch_size", [1, 16, 64]) -@pytest.mark.parametrize("next_n", [1, 2]) +@pytest.mark.parametrize("next_n", [1, 2, 4]) @pytest.mark.parametrize("index_topk", [2048]) -@pytest.mark.parametrize("seq_len_range", [(2048, 8192)]) +@pytest.mark.parametrize("seq_len_range", [(2048, 8192), (512, 1024)]) def test_indexer_decode_custom_vs_fallback(batch_size, next_n, index_topk, seq_len_range): """ @@ -1587,6 +1711,7 @@ def test_indexer_decode_custom_vs_fallback(batch_size, next_n, index_topk, - Different batch sizes - Different next_n values (1, 2, 4 for speculative decode) - Variable 
sequence lengths (90% >= 2048 to test realistic long sequences) + - Short sequences (512, 1024) to test the indexer skip functionality """ torch.manual_seed(42) random.seed(42) @@ -1597,31 +1722,38 @@ def test_indexer_decode_custom_vs_fallback(batch_size, next_n, index_topk, max_model_len = 16384 layer_idx = 0 min_seq_len, max_seq_len = seq_len_range + enable_indexer_skip = max_seq_len <= 2048 - # Generate KV cache lengths (90% >= 2048 to test realistic scenarios) - kv_lens = torch.zeros(batch_size, dtype=torch.int32) - is_long = torch.rand(batch_size) < 0.9 + # Generate KV cache lengths + if enable_indexer_skip: + kv_lens = torch.randint(min_seq_len, + max_seq_len, (batch_size, ), + dtype=torch.int32) + else: + # (90% >= 2048 to test realistic scenarios) + kv_lens = torch.zeros(batch_size, dtype=torch.int32) + is_long = torch.rand(batch_size) < 0.9 - num_long = is_long.sum().item() - if num_long > 0: - long_min = max(2048, min_seq_len) - long_max = max(long_min + 1, max_seq_len) - kv_lens[is_long] = torch.randint(long_min, - long_max, (num_long, ), - dtype=torch.int32) + num_long = is_long.sum().item() + if num_long > 0: + long_min = max(2048, min_seq_len) + long_max = max(long_min + 1, max_seq_len) + kv_lens[is_long] = torch.randint(long_min, + long_max, (num_long, ), + dtype=torch.int32) - num_short = (~is_long).sum().item() - if num_short > 0: - short_max = min(2048, max_seq_len) - if short_max > min_seq_len: - kv_lens[~is_long] = torch.randint(min_seq_len, - short_max, (num_short, ), - dtype=torch.int32) - else: - kv_lens[~is_long] = torch.randint(max(2048, min_seq_len), - max(2049, max_seq_len), - (num_short, ), - dtype=torch.int32) + num_short = (~is_long).sum().item() + if num_short > 0: + short_max = min(2048, max_seq_len) + if short_max > min_seq_len: + kv_lens[~is_long] = torch.randint(min_seq_len, + short_max, (num_short, ), + dtype=torch.int32) + else: + kv_lens[~is_long] = torch.randint(max(2048, min_seq_len), + max(2049, max_seq_len), + (num_short, 
), + dtype=torch.int32) seq_lens = torch.full((batch_size, ), next_n, dtype=torch.int32) num_gen_tokens = batch_size * next_n @@ -1754,35 +1886,58 @@ def test_indexer_decode_custom_vs_fallback(batch_size, next_n, index_topk, weights, use_custom_topk=False) + # Test with indexer skip enabled + if enable_indexer_skip: + metadata_skip = _create_mock_metadata(request_ids, + batch_size, + 0, + batch_size, + seq_lens.clone(), + final_lens.clone(), + num_cached_tokens, + cache_manager, + 0, + num_gen_tokens, + max_model_len, + max_draft_tokens=next_n - 1, + enable_indexer_skip=True) + + Indexer.prepare(metadata_skip) + indexer._update_k_cache(k_fp8, k_scale, metadata_skip) + + try: + topk_indices_skip = indexer.sparse_attn_indexer( + metadata_skip, + hidden_states, + q_fp8, + k_fp8, + k_scale, + weights, + use_custom_topk=True) + except Exception as e: + raise RuntimeError(f"Error when testing indexer skip: {e}") + # Validation + ## Custom vs fallback num_ctx_tokens = 0 custom_decode = topk_indices_custom[num_ctx_tokens:num_ctx_tokens + num_gen_tokens, :] fallback_decode = topk_indices_fallback[num_ctx_tokens:num_ctx_tokens + num_gen_tokens, :] - - num_exact_matches = 0 - total_similarity = 0.0 - - for token_idx in range(num_gen_tokens): - custom_valid = custom_decode[token_idx][custom_decode[token_idx] != -1] - fallback_valid = fallback_decode[token_idx][fallback_decode[token_idx] - != -1] - - if torch.equal(custom_valid, fallback_valid): - num_exact_matches += 1 - total_similarity += 1.0 - elif custom_valid.shape[0] > 0 or fallback_valid.shape[0] > 0: - custom_set = set(custom_valid.cpu().tolist()) - fallback_set = set(fallback_valid.cpu().tolist()) - intersection = len(custom_set & fallback_set) - union = len(custom_set | fallback_set) - total_similarity += intersection / union if union > 0 else 0.0 - + num_exact_matches, total_similarity, _ = validate_topk_indices( + custom_decode, fallback_decode, num_gen_tokens) avg_similarity = total_similarity / num_gen_tokens - 
assert avg_similarity >= 0.95, \ f"Decode custom vs fallback differ: avg similarity {avg_similarity:.4f} < 0.95" + ## Custom vs skip + if enable_indexer_skip: + skip_decode = topk_indices_skip[num_ctx_tokens:num_ctx_tokens + + num_gen_tokens, :] + num_exact_matches, total_similarity, _ = validate_topk_indices( + custom_decode, skip_decode, num_gen_tokens) + avg_similarity = total_similarity / num_gen_tokens + assert avg_similarity >= 0.95, \ + f"Decode custom vs skip differ: avg similarity {avg_similarity:.4f} < 0.95" @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available") @@ -1895,27 +2050,9 @@ def test_indexer_prefill_chunked_custom_vs_fallback(batch_size, index_topk, use_custom_topk=False) # Validation - num_exact_matches = 0 - total_similarity = 0.0 - - for token_idx in range(total_tokens): - custom_valid = topk_indices_custom[token_idx][ - topk_indices_custom[token_idx] != -1] - fallback_valid = topk_indices_fallback[token_idx][ - topk_indices_fallback[token_idx] != -1] - - if torch.equal(custom_valid, fallback_valid): - num_exact_matches += 1 - total_similarity += 1.0 - elif custom_valid.shape[0] > 0 or fallback_valid.shape[0] > 0: - custom_set = set(custom_valid.cpu().tolist()) - fallback_set = set(fallback_valid.cpu().tolist()) - intersection = len(custom_set & fallback_set) - union = len(custom_set | fallback_set) - total_similarity += intersection / union if union > 0 else 0.0 - + num_exact_matches, total_similarity, _ = validate_topk_indices( + topk_indices_custom, topk_indices_fallback, total_tokens) avg_similarity = total_similarity / total_tokens - assert avg_similarity >= 0.95, \ f"Chunked prefill differ: avg similarity {avg_similarity:.4f} < 0.95" @@ -1940,11 +2077,13 @@ def test_indexer_prefill_single_pass_custom_vs_fallback(batch_size, index_topk, layer_idx = 0 min_seq_len, max_seq_len = seq_len_range + # Generate variable context lengths per sequence seq_lens = torch.randint(min_seq_len, max_seq_len, (batch_size, ), 
dtype=torch.int32) total_tokens = seq_lens.sum().item() + # Create cache manager and indexer cache_manager, sparse_attn_config = create_dsa_cache_manager( batch_size=batch_size, head_dim=head_dim, @@ -1960,6 +2099,7 @@ def test_indexer_prefill_single_pass_custom_vs_fallback(batch_size, index_topk, is_gen=False, prepare_resource=True) + # Generate test data q = torch.randn((total_tokens, heads, head_dim), device="cuda", dtype=torch.bfloat16) @@ -2020,34 +2160,51 @@ def test_indexer_prefill_single_pass_custom_vs_fallback(batch_size, index_topk, weights, use_custom_topk=False) + # Test with indexer skip enabled + metadata_skip = _create_mock_metadata(request_ids, + batch_size, + batch_size, + 0, + seq_lens.clone(), + seq_lens.clone(), [0] * batch_size, + cache_manager, + total_tokens, + total_tokens, + max_model_len, + enable_indexer_skip=True) + Indexer.prepare(metadata_skip) + indexer._update_k_cache(k_fp8, k_scale, metadata_skip) + metadata_skip.indexer_prefill_chunks = None + + try: + topk_indices_skip = indexer.sparse_attn_indexer(metadata_skip, + hidden_states, + q_fp8, + k_fp8, + k_scale, + weights, + use_custom_topk=True) + except Exception as e: + raise RuntimeError(f"Indexer skip not available: {e}") + # Validation - num_exact_matches = 0 - total_similarity = 0.0 - - for token_idx in range(total_tokens): - custom_valid = topk_indices_custom[token_idx][ - topk_indices_custom[token_idx] != -1] - fallback_valid = topk_indices_fallback[token_idx][ - topk_indices_fallback[token_idx] != -1] - - if torch.equal(custom_valid, fallback_valid): - num_exact_matches += 1 - total_similarity += 1.0 - else: - custom_set = set(custom_valid.cpu().tolist()) - fallback_set = set(fallback_valid.cpu().tolist()) - intersection = len(custom_set & fallback_set) - union = len(custom_set | fallback_set) - total_similarity += intersection / union if union > 0 else 0.0 - + ## Custom vs fallback + num_exact_matches, total_similarity, _ = validate_topk_indices( + topk_indices_custom, 
topk_indices_fallback, total_tokens) + avg_similarity = total_similarity / total_tokens + assert avg_similarity >= 0.95, \ + f"Single-pass prefill differ: avg similarity {avg_similarity:.4f} < 0.95" + ## Custom vs skip + num_exact_matches, total_similarity, _ = validate_topk_indices( + topk_indices_custom, topk_indices_skip, total_tokens) avg_similarity = total_similarity / total_tokens - assert avg_similarity >= 0.95, \ f"Single-pass prefill differ: avg similarity {avg_similarity:.4f} < 0.95" @skip_pre_hopper -def test_indexer_topk_multi_request_with_different_cache(): +@pytest.mark.parametrize("enable_indexer_skip", [True, False]) +def test_indexer_topk_multi_request_with_different_cache(enable_indexer_skip): """ Test that custom topk kernel handles multi-request batches with different cached amounts. """ @@ -2063,7 +2220,10 @@ def test_indexer_topk_multi_request_with_different_cache(): # Critical: different cached amounts seq_lens = [256, 237] # NEW tokens - cached_tokens = [0, 3584] # Req0: no cache, Req1: large cache + if enable_indexer_skip: + cached_tokens = [256, 584] # Req0: no cache, Req1: short cache + else: + cached_tokens = [0, 3584] # Req0: no cache, Req1: large cache total_kv_lens = [seq_lens[i] + cached_tokens[i] for i in range(batch_size)] total_tokens = sum(seq_lens) @@ -2145,6 +2305,32 @@ def test_indexer_topk_multi_request_with_different_cache(): weights, use_custom_topk=False) + # Test with indexer skip enabled + if enable_indexer_skip: + metadata_skip = _create_mock_metadata( + request_ids, + batch_size, + batch_size, + 0, + torch.tensor(seq_lens, dtype=torch.int32), + torch.tensor(total_kv_lens, dtype=torch.int32), + cached_tokens, + cache_manager, + total_tokens, + total_tokens, + indexer_max_chunk_size=32768, + enable_context_mla_with_cached_kv=True, + enable_indexer_skip=True) + Indexer.prepare(metadata_skip) + indexer._update_k_cache(k_fp8, k_scale, metadata_skip) + topk_indices_skip = indexer.sparse_attn_indexer(metadata_skip, + 
hidden_states, + q_fp8, + k_fp8, + k_scale, + weights, + use_custom_topk=True) + # Validate: custom and fallback should match print(f"\n=== Validation ===") @@ -2190,34 +2376,23 @@ def test_indexer_topk_multi_request_with_different_cache(): print(f" ✓ All large-window tokens have {index_topk} valid indices") # Validation - num_exact_matches = 0 - total_similarity = 0.0 - min_similarity = 1.0 - - for token_idx in range(total_tokens): - custom_valid = topk_custom[token_idx][topk_custom[token_idx] >= 0] - fallback_valid = topk_fallback[token_idx][topk_fallback[token_idx] >= 0] - - if torch.equal(custom_valid, fallback_valid): - num_exact_matches += 1 - similarity = 1.0 - total_similarity += similarity - else: - custom_set = set(custom_valid.cpu().tolist()) - fallback_set = set(fallback_valid.cpu().tolist()) - intersection = len(custom_set & fallback_set) - union = len(custom_set | fallback_set) - similarity = intersection / union if union > 0 else 0.0 - total_similarity += similarity - - # Track min similarity - min_similarity = min(min_similarity, similarity) - + num_exact_matches, total_similarity, min_similarity = validate_topk_indices( + topk_custom, topk_fallback, total_tokens) avg_similarity = total_similarity / total_tokens - print(f" Exact matches: {num_exact_matches}/{total_tokens}") print( f" Similarity - Min: {min_similarity:.4f}, Avg: {avg_similarity:.4f}") assert avg_similarity >= 0.95, \ f"Custom vs fallback differ: avg similarity {avg_similarity:.4f} < 0.95" + + if enable_indexer_skip: + num_exact_matches, total_similarity, min_similarity = validate_topk_indices( + topk_custom, topk_indices_skip, total_tokens) + avg_similarity = total_similarity / total_tokens + print(f" Exact matches: {num_exact_matches}/{total_tokens}") + print( + f" Similarity - Min: {min_similarity:.4f}, Avg: {avg_similarity:.4f}" + ) + assert avg_similarity >= 0.95, \ + f"Custom vs indexer skip differ: avg similarity {avg_similarity:.4f} < 0.95" From 
dfc879935233b06a0bddcb8d02416b4a0bbcc925 Mon Sep 17 00:00:00 2001 From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Date: Sun, 14 Dec 2025 21:23:59 -0800 Subject: [PATCH 136/172] [https://nvbugs/5669114][fix] Switch to MMMU benchmark for Gemma3 27B (#9966) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- .../defs/accuracy/references/mmmu.yaml | 2 + .../test_llm_api_pytorch_multimodal.py | 31 +++++++++ tests/integration/defs/test_e2e.py | 63 +------------------ .../test_lists/qa/llm_function_core.txt | 5 +- .../qa/llm_function_core_sanity.txt | 3 +- .../test_lists/qa/llm_function_nim.txt | 5 +- .../test_lists/test-db/l0_h100.yml | 2 +- 7 files changed, 39 insertions(+), 72 deletions(-) diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 9e094af96e..6a8cc12d00 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -1,3 +1,5 @@ +google/gemma-3-27b-it: + - accuracy: 52.0 Qwen/Qwen2-VL-7B-Instruct: - accuracy: 48.44 Qwen/Qwen2.5-VL-7B-Instruct: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index ccb66ddd29..194715ed29 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -214,3 +214,34 @@ class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness): ) as llm: task = MMMU(self.MODEL_NAME) task.evaluate(llm, sampling_params=self.sampling_params) + + +class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness): + MODEL_NAME = "google/gemma-3-27b-it" + MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it/" + MAX_NUM_TOKENS = 25600 + + sampling_params = SamplingParams( + max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="" + ) + + # Gemma3 VLM needs KV 
cache reuse disabled for custom mask support. + kv_cache_config = KvCacheConfig( + enable_block_reuse=False, + enable_partial_reuse=False, + free_gpu_memory_fraction=0.6, + ) + + def test_auto_dtype(self): + # Gemma3 VLM needs FlashInfer attention backend for custom mask support. + with LLM( + self.MODEL_PATH, + max_batch_size=16, + max_num_tokens=self.MAX_NUM_TOKENS, + max_seq_len=8704, # 8192 + 512. + kv_cache_config=self.kv_cache_config, + attn_backend="FLASHINFER", + enable_chunked_prefill=False, + ) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 11dbcbd822..840e856b29 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2490,10 +2490,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv): pytest.param("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503", marks=pytest.mark.skip_less_device_memory(80000)), - pytest.param("gemma-3-27b-it", - "gemma/gemma-3-27b-it", - marks=(pytest.mark.skip_less_device_memory(80000), - skip_post_blackwell)), pytest.param( "Nano-v2-VLM", "Nano-v2-VLM", @@ -2574,26 +2570,9 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, ] if use_cuda_graph: cmd.append("--use_cuda_graph") - # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently. - # Custom mask involves bidirectional masking of image tokens in context phase. To get this - # correct, chunked prefill and kv cache reuse need to be turned off. - if model_name == "gemma-3-27b-it": - cmd.append("--image_format=pil") - cmd.append("--attention_backend=FLASHINFER") - cmd.append("--disable_kv_cache_reuse") - cmd.append("--kv_cache_fraction=0.5") - cmd.append("--max_seq_len=1024") output = llm_venv.run_cmd(cmd, caller=check_output) - # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. 
- if model_name == "gemma-3-27b-it": - print( - f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." - ) - print("output:", output) - return - match_ratio = 4.0 / 5 parsed_outputs = parse_output(output) for prompt_output, prompt_keywords in zip( @@ -2860,8 +2839,6 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, model_name, @pytest.mark.skip_less_device(2) @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize("model_name,model_path", [ - pytest.param( - "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), ]) def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, @@ -2915,29 +2892,12 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, ] # Add model-specific configurations - if model_name == "gemma-3-27b-it": - # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently. - # Custom mask involves bidirectional masking of image tokens in context phase. To get this - # correct, chunked prefill and kv cache reuse need to be turned off. - cmd.append("--image_format=pil") - cmd.append("--attention_backend=FLASHINFER") - cmd.append("--disable_kv_cache_reuse") - cmd.append("--kv_cache_fraction=0.5") - cmd.append("--max_seq_len=1024") - elif model_name == "mistral-small-3.1-24b-instruct": + if model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral cmd.append("--disable_kv_cache_reuse") output = llm_venv.run_cmd(cmd, caller=check_output) - # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. - if model_name == "gemma-3-27b-it": - print( - f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." 
- ) - print("output:", output) - return - # Set match ratio based on model match_ratio = 4.0 / 5 @@ -2957,8 +2917,6 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize("model_name,model_path", [ ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), - pytest.param( - "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ]) def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, model_path): @@ -3008,30 +2966,13 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, ] # Add model-specific configurations - if model_name == "gemma-3-27b-it": - # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently. - # Custom mask involves bidirectional masking of image tokens in context phase. To get this - # correct, chunked prefill and kv cache reuse need to be turned off. - cmd.append("--image_format=pil") - cmd.append("--attention_backend=FLASHINFER") - cmd.append("--disable_kv_cache_reuse") - cmd.append("--kv_cache_fraction=0.5") - cmd.append("--max_seq_len=1024") - - elif model_name == "mistral-small-3.1-24b-instruct": + if model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral cmd.append("--disable_kv_cache_reuse") output = llm_venv.run_cmd(cmd, caller=check_output) print("output:", output) - # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. - if model_name == "gemma-3-27b-it": - print( - f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." 
- ) - return - # Set match ratio based on model match_ratio = 4.0 / 5 if model_name.startswith("Phi-4-multimodal-instruct"): diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 303fa1b0d4..776ef654ef 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -655,6 +655,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] @@ -696,8 +697,6 @@ test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepS test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] 
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] @@ -705,9 +704,7 @@ test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-mult test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 91474ba213..2777b44736 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -225,6 +225,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype 
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] @@ -262,8 +263,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 515957f2b3..18384c74d8 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -390,6 +390,7 @@ accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] 
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] test_e2e.py::test_openai_chat_harmony @@ -455,14 +456,10 @@ test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepS test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] -test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index d7906b794f..c29d5ab756 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ 
b/tests/integration/test_lists/test-db/l0_h100.yml @@ -48,6 +48,7 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse_disable_overlap_scheduler + - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90) @@ -108,7 +109,6 @@ l0_h100: - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B] - test_e2e.py::test_openai_chat_harmony - test_e2e.py::test_openai_responses - - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] TIMEOUT (90) - test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B] # ------------- AutoDeploy tests --------------- - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1] From 25db9e7b3ea360206a6d15e437e6a9a6e3b69972 Mon Sep 17 00:00:00 2001 From: shuyixiong <219646547+shuyixiong@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:24:43 +0800 Subject: [PATCH 137/172] [https://nvbugs/5741060][chore] Waive all pg operator tests (#9991) Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8acd4d9178..93354a5f12 100644 --- a/tests/integration/test_lists/waives.txt +++ 
b/tests/integration/test_lists/waives.txt @@ -441,3 +441,4 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2 unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[True] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve[True] SKIP (https://nvbugs/5739981) +unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py SKIP (https://nvbugs/5741060) From f2aee0db03396f856f05adeef6e375ab97f1540f Mon Sep 17 00:00:00 2001 From: Ziyi Xiong <219238287+ziyixiong-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:28:54 +0800 Subject: [PATCH 138/172] [TRTLLM-9854][feat] Optimize the host overhead of _sample_async (#9935) Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/sampler.py | 76 +++++++++++++++++++ .../_torch/sampler/test_torch_sampler.py | 9 ++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 83826eaad7..c358c9eefe 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -968,6 +968,23 @@ class TorchSampler(Sampler, AsyncWorkerMixin): def _use_beam_search(self) -> bool: return self.max_beam_width > 1 + def _can_use_fast_greedy_path(self, requests: list[LlmRequest]) -> bool: + """ + Check if we can use the fast argmax path for greedy sampling. 
+ """ + + # Check if all requests use greedy sampling and don't require features + # that the fast path skips + for req in requests: + # vocab_size doesn't affect greediness check + if _request_strategy(req, vocab_size=2**31) != GREEDY: + return False + + # Fast path skips logprobs handling + if req.py_return_log_probs: + return False + return True + @staticmethod def _meet_max_token_stop_criteria( request: LlmRequest, max_seq_len: int, beam_idx: int = DEFAULT_BEAM_IDX @@ -1882,6 +1899,34 @@ class TorchSampler(Sampler, AsyncWorkerMixin): d2t = model_outputs["d2t"][tokens] tokens += d2t + @staticmethod + @nvtx_range("fast_greedy_sample_kernel") + def _fast_greedy_sample_kernel( + logits_cuda: torch.Tensor, + new_tokens_cuda: torch.Tensor, + batch_dest_indices: torch.Tensor, + max_beam_width: int, + d2t: torch.Tensor | None, + ) -> None: + """Applies fast greedy sampling to the logits. + + Performs argmax, applies d2t translation if present, and scatters + tokens into the output buffer. All operations are in-place. 
+ """ + # Simple argmax for greedy sampling + next_tokens = torch.argmax(logits_cuda, dim=-1).to(dtype=new_tokens_cuda.dtype) + + # Apply draft-to-target token translation if present (for Eagle3) + if d2t is not None: + next_tokens += d2t[next_tokens] + + # Scatter tokens into output buffer + batch_dest_indices_expanded = batch_dest_indices.unsqueeze(1).expand(-1, max_beam_width) + next_tokens_expanded = next_tokens.unsqueeze(1).expand(-1, max_beam_width) + new_tokens_cuda.view(-1, *new_tokens_cuda.shape[2:]).scatter_( + 0, batch_dest_indices_expanded, next_tokens_expanded + ) + @staticmethod def _apply_embedding_bias( logits: torch.Tensor, @@ -2372,6 +2417,7 @@ class TorchSampler(Sampler, AsyncWorkerMixin): if (r.py_stop_words_list is not None and len(r.py_stop_words_list[0]) > 0) ] + @nvtx_range("_write_finish_reasons") def _write_finish_reasons( self, requests: list[LlmRequest], @@ -2637,6 +2683,36 @@ class TorchSampler(Sampler, AsyncWorkerMixin): sampling_requests_metadata.req_num_beams, ) + # Fast path for greedy sampling + if self._can_use_fast_greedy_path(requests): + # Compute destination indices on CPU (same pattern as _unbatch_sampling_results) + batch_destination_indexer = _UnpackedStepIndexer( + seq_slots=seq_slots, + num_steps=sampling_requests_metadata.req_num_generated_tokens, + steps_dim_size=new_tokens_cuda.size(0), + slots_dim_size=new_tokens_cuda.size(1), + dim_order=_UnpackedStepIndexer.DimOrder.STEP_MAJOR, + index_dtype=torch.int64, + ) + batch_dest_indices_cuda = batch_destination_indexer[:].to( + new_tokens_cuda.device, non_blocking=True + ) + + # Get d2t tensor if present + d2t = model_outputs.get("d2t", None) + + # Run compiled kernel for argmax, d2t application, and scatter + self._fast_greedy_sample_kernel( + logits_cuda, + new_tokens_cuda, + batch_dest_indices_cuda, + self.max_beam_width, + d2t, + ) + + new_tokens_host = self._copy_to_host(new_tokens_cuda) + return new_tokens_host + # Indexer for accessing tokens in 'logits_cuda', 
corresponding to the # requests in 'requests'. steps_dim_size = new_tokens_cuda.size(0) diff --git a/tests/unittest/_torch/sampler/test_torch_sampler.py b/tests/unittest/_torch/sampler/test_torch_sampler.py index 2daa357b54..d3ea447676 100644 --- a/tests/unittest/_torch/sampler/test_torch_sampler.py +++ b/tests/unittest/_torch/sampler/test_torch_sampler.py @@ -1565,7 +1565,14 @@ class TestBatchedSampling: num_context_logits_prefix_sum, resource_manager, ) - assert flashinfer_keys_seen + + # Fast greedy path bypasses flashinfer sampling, so flashinfer_keys_seen + # will be empty when all requests are greedy + all_greedy = all( + _request_strategy(req, vocab_size=2**31) == GREEDY + for req in scheduled_requests.all_requests() + ) + assert flashinfer_keys_seen or all_greedy return res patch_ctx.setattr(sampler, "sample_async", _sample_async) From af899d2fe7e876cb2e30fddbfef86cad4e3c9009 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:46:13 +0800 Subject: [PATCH 139/172] [TRTLLM-9860][doc] Add docs and examples for Responses API (#9946) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- .../commands/trtllm-serve/trtllm-serve.rst | 20 ++- examples/serve/compatibility/README.md | 32 +++-- .../serve/compatibility/responses/README.md | 102 ++++++++++++++ .../responses/example_01_basic_chat.py | 48 +++++++ .../responses/example_02_streaming_chat.py | 98 +++++++++++++ .../example_03_multi_turn_conversation.py | 63 +++++++++ .../responses/example_04_json_mode.py | 80 +++++++++++ .../responses/example_05_tool_calling.py | 132 ++++++++++++++++++ examples/serve/curl_responses_client.sh | 9 ++ examples/serve/openai_responses_client.py | 15 ++ .../llmapi/apps/_test_trtllm_serve_example.py | 4 +- 11 files changed, 590 insertions(+), 13 deletions(-) create mode 100644 examples/serve/compatibility/responses/README.md create mode 100644 
examples/serve/compatibility/responses/example_01_basic_chat.py create mode 100644 examples/serve/compatibility/responses/example_02_streaming_chat.py create mode 100644 examples/serve/compatibility/responses/example_03_multi_turn_conversation.py create mode 100644 examples/serve/compatibility/responses/example_04_json_mode.py create mode 100644 examples/serve/compatibility/responses/example_05_tool_calling.py create mode 100644 examples/serve/curl_responses_client.sh create mode 100644 examples/serve/openai_responses_client.py diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index 25ed2bc394..33bad7f1e5 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -34,7 +34,7 @@ For the full syntax and argument descriptions, refer to :ref:`syntax`. Inference Endpoints ------------------- -After you start the server, you can send inference requests through completions API and Chat API, which are compatible with corresponding OpenAI APIs. We use `TinyLlama-1.1B-Chat-v1.0 `_ for examples in the following sections. +After you start the server, you can send inference requests through completions API, Chat API and Responses API, which are compatible with corresponding OpenAI APIs. We use `TinyLlama-1.1B-Chat-v1.0 `_ for examples in the following sections. Chat API ~~~~~~~~ @@ -66,6 +66,24 @@ Another example uses ``curl``: :language: bash :linenos: +Responses API +~~~~~~~~~~~~~~~ + +You can query the Responses API with any HTTP client; a typical example is the OpenAI Python client: + +.. literalinclude:: ../../../../examples/serve/openai_responses_client.py + :language: python + :linenos: + +Another example uses ``curl``: + +.. literalinclude:: ../../../../examples/serve/curl_responses_client.sh + :language: bash + :linenos: + + +More OpenAI-compatible examples can be found in the `compatibility examples `_ directory.
+ Multimodal Serving ~~~~~~~~~~~~~~~~~~ diff --git a/examples/serve/compatibility/README.md b/examples/serve/compatibility/README.md index f3e375843b..5351f269e8 100644 --- a/examples/serve/compatibility/README.md +++ b/examples/serve/compatibility/README.md @@ -34,17 +34,27 @@ python examples/serve/compatibility/chat_completions/example_01_basic_chat.py ### 📋 Complete Example List -All examples demonstrate the `/v1/chat/completions` endpoint: +#### Chat Completions (`/v1/chat/completions`) | Example | File | Description | |---------|------|-------------| -| **01** | `example_01_basic_chat.py` | Basic non-streaming chat completion | -| **02** | `example_02_streaming_chat.py` | Streaming responses with real-time delivery | -| **03** | `example_03_multi_turn_conversation.py` | Multi-turn conversation with context | -| **04** | `example_04_streaming_with_usage.py` | Streaming with continuous token usage stats | -| **05** | `example_05_json_mode.py` | Structured output with JSON schema | -| **06** | `example_06_tool_calling.py` | Function/tool calling with tools | -| **07** | `example_07_advanced_sampling.py` | TensorRT-LLM extended sampling parameters | +| **01** | `chat_completions/example_01_basic_chat.py` | Basic non-streaming chat completion | +| **02** | `chat_completions/example_02_streaming_chat.py` | Streaming responses with real-time delivery | +| **03** | `chat_completions/example_03_multi_turn_conversation.py` | Multi-turn conversation with context | +| **04** | `chat_completions/example_04_streaming_with_usage.py` | Streaming with continuous token usage stats | +| **05** | `chat_completions/example_05_json_mode.py` | Structured output with JSON schema | +| **06** | `chat_completions/example_06_tool_calling.py` | Function/tool calling with tools | +| **07** | `chat_completions/example_07_advanced_sampling.py` | TensorRT-LLM extended sampling parameters | + +#### Responses (`/v1/responses`) + +| Example | File | Description | 
+|---------|------|-------------| +| **01** | `responses/example_01_basic_chat.py` | Basic non-streaming response | +| **02** | `responses/example_02_streaming_chat.py` | Streaming with event handling | +| **03** | `responses/example_03_multi_turn_conversation.py` | Multi-turn using `previous_response_id` | +| **04** | `responses/example_04_json_mode.py` | Structured output with JSON schema | +| **05** | `responses/example_05_tool_calling.py` | Function/tool calling with tools | ## Configuration @@ -68,8 +78,8 @@ client = OpenAI( Some examples require specific model capabilities: -| Example | Model Requirement | +| Feature | Model Requirement | |---------|------------------| -| 05 (JSON Mode) | xgrammar support | -| 06 (Tool Calling) | Tool-capable model (Qwen3, GPT OSS) | +| JSON Mode | xgrammar support | +| Tool Calling | Tool-capable model (Qwen3, GPT-OSS, Kimi K2) | | Others | Any model | diff --git a/examples/serve/compatibility/responses/README.md b/examples/serve/compatibility/responses/README.md new file mode 100644 index 0000000000..4dbdcf850a --- /dev/null +++ b/examples/serve/compatibility/responses/README.md @@ -0,0 +1,102 @@ +# Responses API Examples + +Examples for the `/v1/responses` endpoint. All examples in this directory use the Responses API, demonstrating features such as streaming, tool/function calling, and multi-turn dialogue. + +## Quick Start + +```bash +# Run the basic example +python example_01_basic_chat.py +``` + +## Examples Overview + +### Basic Examples + +1. **`example_01_basic_chat.py`** - Start here! + - Simple request/response + - Non-streaming mode + - Uses `input` parameter for user message + +2. **`example_02_streaming_chat.py`** - Real-time responses + - Stream tokens as generated + - Handles various event types (`response.created`, `response.output_text.delta`, etc.) + - Server-Sent Events (SSE) + +3. 
**`example_03_multi_turn_conversation.py`** - Context management + - Multiple conversation turns + - Uses `previous_response_id` to maintain context + - Follow-up questions without resending history + +### Advanced Examples + +4. **`example_04_json_mode.py`** - Structured output + - JSON schema validation via `text.format` + - Structured data extraction + - Requires xgrammar support + +5. **`example_05_tool_calling.py`** - Function calling + - External tool integration + - Function definitions with `tools` parameter + - Tool result handling with `function_call_output` + - Requires compatible model (Qwen3, GPT-OSS, Kimi K2) + +## Key Concepts + +### Non-Streaming vs Streaming + +**Non-Streaming** (`stream=False`): +- Wait for complete response +- Single response object +- Simple to use + +**Streaming** (`stream=True`): +- Tokens delivered as generated +- Better perceived latency +- Server-Sent Events (SSE) + +### Multi-turn Context + +Use `previous_response_id` to continue conversations: +```python +# First turn +response1 = client.responses.create( + model=model, + input="What is 15 multiplied by 23?", +) + +# Second turn - references previous response +response2 = client.responses.create( + model=model, + input="Now divide that result by 5", + previous_response_id=response1.id, +) +``` + +### Tool Calling + +Define functions the model can call: +```python +tools = [{ + "name": "get_weather", + "type": "function", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"}, + }, + "required": ["location"], + } +}] +``` + +## Model Requirements + +| Feature | Requirement | +|---------|-------------| +| Basic chat | Any model | +| Streaming | Any model | +| Multi-turn | Any model | +| JSON mode | xgrammar support | +| Tool calling | Compatible model (Qwen3, GPT-OSS, Kimi K2) | diff --git a/examples/serve/compatibility/responses/example_01_basic_chat.py 
b/examples/serve/compatibility/responses/example_01_basic_chat.py new file mode 100644 index 0000000000..237108017f --- /dev/null +++ b/examples/serve/compatibility/responses/example_01_basic_chat.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 +"""Example 1: Basic Non-Streaming Responses. + +Demonstrates a simple responses request with the OpenAI-compatible API. 
+""" + +from openai import OpenAI + +# Initialize the client +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +# Get the model name from the server +models = client.models.list() +model = models.data[0].id + +print("=" * 80) +print("Example 1: Basic Non-Streaming Responses") +print("=" * 80) +print() + +# Create a simple responses request +response = client.responses.create( + model=model, + input="What is the capital of France?", + max_output_tokens=4096, +) + +# Print the response +print("Response:") +print(f"Content: {response.output_text}") diff --git a/examples/serve/compatibility/responses/example_02_streaming_chat.py b/examples/serve/compatibility/responses/example_02_streaming_chat.py new file mode 100644 index 0000000000..1e6e92d51f --- /dev/null +++ b/examples/serve/compatibility/responses/example_02_streaming_chat.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 +"""Example 2: Streaming Responses. + +Demonstrates streaming responses with real-time token delivery. 
+""" + +from openai import OpenAI + + +def print_streaming_responses_item(item, show_events=True): + event_type = getattr(item, "type", "") + + if event_type == "response.created": + if show_events: + print(f"[Response Created: {getattr(item.response, 'id', 'unknown')}]") + elif event_type == "response.in_progress": + if show_events: + print("[Response In Progress]") + elif event_type == "response.output_item.added": + if show_events: + item_type = getattr(item.item, "type", "unknown") + item_id = getattr(item.item, "id", "unknown") + print(f"\n[Output Item Added: {item_type} (id: {item_id})]") + elif event_type == "response.content_part.added": + if show_events: + part_type = getattr(item.part, "type", "unknown") + print(f"[Content Part Added: {part_type}]") + elif event_type == "response.reasoning_text.delta": + print(item.delta, end="", flush=True) + elif event_type == "response.output_text.delta": + print(item.delta, end="", flush=True) + elif event_type == "response.reasoning_text.done": + if show_events: + print(f"\n[Reasoning Text Done: {len(item.text)} chars]") + elif event_type == "response.output_text.done": + if show_events: + print(f"\n[Output Text Done: {len(item.text)} chars]") + elif event_type == "response.content_part.done": + if show_events: + part_type = getattr(item.part, "type", "unknown") + print(f"[Content Part Done: {part_type}]") + elif event_type == "response.output_item.done": + if show_events: + item_type = getattr(item.item, "type", "unknown") + item_id = getattr(item.item, "id", "unknown") + print(f"[Output Item Done: {item_type} (id: {item_id})]") + elif event_type == "response.completed": + if show_events: + print("\n[Response Completed]") + + +# Initialize the client +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +# Get the model name from the server +models = client.models.list() +model = models.data[0].id + +print("=" * 80) +print("Example 2: Streaming Responses") +print("=" * 80) +print() 
+ +print("Prompt: Write a haiku about artificial intelligence\n") + +# Create a streaming responses +stream = client.responses.create( + model=model, + input="Write a haiku about artificial intelligence", + max_output_tokens=4096, + stream=True, +) + +# Print tokens as they arrive +print("Response (streaming):") +print("Assistant: ", end="", flush=True) + +current_state = "none" +for event in stream: + print_streaming_responses_item(event) diff --git a/examples/serve/compatibility/responses/example_03_multi_turn_conversation.py b/examples/serve/compatibility/responses/example_03_multi_turn_conversation.py new file mode 100644 index 0000000000..c24c23226e --- /dev/null +++ b/examples/serve/compatibility/responses/example_03_multi_turn_conversation.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 +"""Example 3: Multi-turn Conversation. + +Demonstrates maintaining conversation context across multiple turns. 
+""" + +from openai import OpenAI + +# Initialize the client +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +# Get the model name from the server +models = client.models.list() +model = models.data[0].id + +print("=" * 80) +print("Example 3: Multi-turn Conversation") +print("=" * 80) +print() + +# First turn: User asks a question +print("USER: What is 15 multiplied by 23?") + +response1 = client.responses.create( + model=model, + input="What is 15 multiplied by 23?", + max_output_tokens=4096, +) + +assistant_reply_1 = response1.output_text +print(f"ASSISTANT: {assistant_reply_1}\n") + +# Second turn: User asks a follow-up question +print("USER: Now divide that result by 5") + +# No context need to be provided for the second turn, only include the previous response id +response2 = client.responses.create( + model=model, + input="Now divide that result by 5", + max_output_tokens=4096, + previous_response_id=response1.id, +) + +assistant_reply_2 = response2.output_text +print(f"ASSISTANT: {assistant_reply_2}") diff --git a/examples/serve/compatibility/responses/example_04_json_mode.py b/examples/serve/compatibility/responses/example_04_json_mode.py new file mode 100644 index 0000000000..83d4b9be20 --- /dev/null +++ b/examples/serve/compatibility/responses/example_04_json_mode.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 +"""Example 4: JSON Mode with Schema. + +Demonstrates structured output generation with JSON schema validation. + +Note: This requires xgrammar support and compatible model configuration. +""" + +import json + +from openai import OpenAI + +# Initialize the client +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +# Get the model name from the server +models = client.models.list() +model = models.data[0].id + +print("=" * 80) +print("Example 4: JSON Mode with Schema") +print("=" * 80) +print() + +# Define the JSON schema +schema = { + "type": "json_schema", + "name": "city_info", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "country": {"type": "string"}, + "population": {"type": "integer"}, + "famous_for": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["name", "country", "population"], + }, + "strict": True, +} + +print("Request with JSON schema:") +print(json.dumps(schema, indent=2)) +print() +print("Note: JSON schema support requires xgrammar and compatible model configuration.\n") + +try: + # Create responses with JSON schema + response = client.responses.create( + model=model, + instructions="You are a helpful assistant that outputs JSON.", + input="Give me information about Tokyo.", + text={"format": schema}, + reasoning={"effort": "low"}, + max_output_tokens=1024, + ) + + print("JSON Response:") + print(response.output_text) +except Exception as e: + print("JSON schema support requires xgrammar and proper configuration.") + print(f"Error: {e}") diff --git a/examples/serve/compatibility/responses/example_05_tool_calling.py b/examples/serve/compatibility/responses/example_05_tool_calling.py new file mode 100644 index 0000000000..6489e7e453 --- /dev/null +++ b/examples/serve/compatibility/responses/example_05_tool_calling.py @@ -0,0 
+1,132 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 +"""Example 5: Tool/Function Calling. + +Demonstrates tool calling with function definitions and responses. + +Note: This requires a compatible model (e.g., Qwen3, GPT-OSS, Kimi K2). +""" + +import json + +from openai import OpenAI + +# Initialize the client +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +# Get the model name from the server +models = client.models.list() +model = models.data[0].id +TOOL_CALL_SUPPORTED_MODELS = ["Qwen3", "GPT-OSS", "Kimi K2"] + +print("=" * 80) +print("Example 5: Tool/Function Calling") +print("=" * 80) +print() +print( + f"Note: Tool calling requires compatible models (e.g. {', '.join(TOOL_CALL_SUPPORTED_MODELS)})\n" +) + +# Define the available tools +tools = [ + { + "name": "get_weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and state, e.g. 
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + "type": "function", + "description": "Get the current weather in a location", + } +] + + +def get_weather(location: str, unit: str = "fahrenheit") -> dict: + return {"location": location, "temperature": 68, "unit": unit, "conditions": "sunny"} + + +def process_tool_call(response) -> tuple[dict, str]: + function_name = None + function_arguments = None + tool_call_id = None + for output in response.output: + if output.type == "function_call": + function_name = output.name + function_arguments = json.loads(output.arguments) + tool_call_id = output.call_id + break + + try: + print( + f"Get tool call result:\n\ttool_name: {function_name}\n\tparameters: {function_arguments})" + ) + result = eval(f"{function_name}(**{function_arguments})") + except Exception as e: + print(f"Error processing tool call: {e}") + return None, None + + return result, tool_call_id + + +print("Available tools:") +print(json.dumps(tools, indent=2)) +print("\nUser query: What is the weather in San Francisco?\n") + +try: + # Initial request with tools + response = client.responses.create( + model=model, + input="What is the weather in San Francisco?", + tools=tools, + tool_choice="auto", + max_output_tokens=4096, + ) + + tool_call_result, tool_call_id = process_tool_call(response) + call_input = [ + { + "type": "function_call_output", + "call_id": tool_call_id, + "output": json.dumps(tool_call_result), + } + ] + + prev_response_id = response.id + response = client.responses.create( + model=model, + input=call_input, + previous_response_id=prev_response_id, + tools=tools, + ) + + print(f"Final response: {response.output_text}") + +except Exception as e: + print( + f"Note: Tool calling requires model support (e.g. 
{', '.join(TOOL_CALL_SUPPORTED_MODELS)})" + ) + print(f"Error: {e}") diff --git a/examples/serve/curl_responses_client.sh b/examples/serve/curl_responses_client.sh new file mode 100644 index 0000000000..7a54f21bb8 --- /dev/null +++ b/examples/serve/curl_responses_client.sh @@ -0,0 +1,9 @@ +#! /usr/bin/env bash + +curl http://localhost:8000/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "model": "TinyLlama-1.1B-Chat-v1.0", + "input": "Where is New York?", + "max_output_tokens": 16 + }' diff --git a/examples/serve/openai_responses_client.py b/examples/serve/openai_responses_client.py new file mode 100644 index 0000000000..04d1b356b7 --- /dev/null +++ b/examples/serve/openai_responses_client.py @@ -0,0 +1,15 @@ +### :title OpenAI Responses Client + +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="tensorrt_llm", +) + +response = client.responses.create( + model="TinyLlama-1.1B-Chat-v1.0", + input="Where is New York?", + max_output_tokens=20, +) +print(response) diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py index 6921c024d5..7828b94b87 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py @@ -52,9 +52,11 @@ def example_root(): "exe, script", [("python3", "openai_chat_client.py"), ("python3", "openai_completion_client.py"), ("python3", "openai_completion_client_json_schema.py"), + ("python3", "openai_responses_client.py"), ("bash", "curl_chat_client.sh"), ("bash", "curl_completion_client.sh"), - ("bash", "genai_perf_client.sh")]) + ("bash", "genai_perf_client.sh"), + ("bash", "curl_responses_client.sh")]) def test_trtllm_serve_examples(exe: str, script: str, server: RemoteOpenAIServer, example_root: str): client_script = os.path.join(example_root, script) From 75880297637dcdd5987db2befd18f9e0e9f44456 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu 
<142763828+yuxianq@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:03:46 +0800 Subject: [PATCH 140/172] [None][feat] Async pp send for PPCommTorch. (#9976) Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp | 2 + tensorrt_llm/_torch/device_mesh.py | 12 ++-- .../_torch/distributed/communicator.py | 59 +++++-------------- tensorrt_llm/mapping.py | 11 ++-- 4 files changed, 30 insertions(+), 54 deletions(-) diff --git a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp index 75ae96f36b..a45fa955a0 100644 --- a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp +++ b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp @@ -33,6 +33,7 @@ NcclCommunicatorOp::NcclCommunicatorOp(int64_t worldSize, int64_t rank) void NcclCommunicatorOp::send(th::Tensor tensor, int64_t toRank) const { + tensor.record_stream(at::cuda::getCurrentCUDAStream()); auto ptr = static_cast(tensor.data_ptr()); size_t const size = tensor.numel() * th::elementSize(th::typeMetaToScalarType(tensor.dtype())); tensorrt_llm::runtime::CudaStream cudaStream{at::cuda::getCurrentCUDAStream().stream(), mRank, false}; @@ -41,6 +42,7 @@ void NcclCommunicatorOp::send(th::Tensor tensor, int64_t toRank) const void NcclCommunicatorOp::recv(th::Tensor& tensor, int64_t fromRank) const { + tensor.record_stream(at::cuda::getCurrentCUDAStream()); auto ptr = static_cast(tensor.data_ptr()); size_t const size = tensor.numel() * th::elementSize(th::typeMetaToScalarType(tensor.dtype())); tensorrt_llm::runtime::CudaStream cudaStream{at::cuda::getCurrentCUDAStream().stream(), mRank, false}; diff --git a/tensorrt_llm/_torch/device_mesh.py b/tensorrt_llm/_torch/device_mesh.py index ca8db83385..b5034f8ef7 100644 --- a/tensorrt_llm/_torch/device_mesh.py +++ b/tensorrt_llm/_torch/device_mesh.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, List import torch import torch.distributed as dist -from torch.distributed import 
get_process_group_ranks +from torch.distributed import ProcessGroup, get_process_group_ranks from torch.distributed.device_mesh import init_device_mesh from tensorrt_llm.logger import logger @@ -48,27 +48,27 @@ class DeviceMeshTopologyImpl(_MappingBaseForTypeCheck): # Access Torch ProcessGroup @property @require_device_mesh - def tp_group_pg(self): + def tp_group_pg(self) -> ProcessGroup: return self._get_mesh_dim_by_name('tp').get_group() @property @require_device_mesh - def pp_group_pg(self): + def pp_group_pg(self) -> ProcessGroup: return self._get_mesh_dim_by_name('pp').get_group() @property @require_device_mesh - def cp_group_pg(self): + def cp_group_pg(self) -> ProcessGroup: return self._get_mesh_dim_by_name('cp').get_group() @property @require_device_mesh - def moe_tp_group_pg(self): + def moe_tp_group_pg(self) -> ProcessGroup: return self._get_mesh_dim_by_name('moe_tp').get_group() @property @require_device_mesh - def moe_ep_group_pg(self): + def moe_ep_group_pg(self) -> ProcessGroup: return self._get_mesh_dim_by_name('moe_ep').get_group() # Access rank diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py index 93457691bd..18c7e7a637 100644 --- a/tensorrt_llm/_torch/distributed/communicator.py +++ b/tensorrt_llm/_torch/distributed/communicator.py @@ -16,7 +16,6 @@ try: except Exception: MPI = None # deferred; functions will error if used when ENABLE_MULTI_DEVICE is True -from tensorrt_llm._torch.hostfunc import hostfunc from tensorrt_llm._utils import (mpi_allgather, mpi_barrier, mpi_comm, mpi_disabled, mpi_isend, mpi_isend_object, mpi_recv, mpi_recv_object, mpi_send, @@ -783,26 +782,16 @@ class TorchDist(Distributed): return ret[0] -class PPCommBase: +class PPCommNCCL: def __init__(self, global_mapping: Mapping): self.mapping = global_mapping + self.nccl_comm = torch.classes.trtllm.NcclCommunicatorOp( + self.mapping.world_size, + self.mapping.rank, + ) self.tensor_ready_event = torch.cuda.Event() 
self.send_stream = torch.cuda.Stream() - self.tensor_cache = {} - - def _cache_tensor(self, tensor: torch.Tensor): - cache_id = id(tensor) - self.tensor_cache[cache_id] = tensor - - @hostfunc - def _release_tensor(self, tensor: torch.Tensor): - cache_id = id(tensor) - del self.tensor_cache[cache_id] - - @abstractmethod - def direct_send(self, tensor: torch.Tensor, dest: int): - raise NotImplementedError("direct_send is not implemented") def send(self, tensor: torch.Tensor, dest: Optional[int] = None): if dest is None: @@ -811,30 +800,13 @@ class PPCommBase: # NCCL send kernel in send_stream cannot be captured, # so we send in the current stream instead in CUDA graph cases. if torch.cuda.is_current_stream_capturing(): - self.direct_send(tensor, dest) + self.nccl_comm.send(tensor, dest) return self.tensor_ready_event.record() with torch.cuda.stream(self.send_stream): self.tensor_ready_event.wait() - # tensor may be released before NCCL send finished, - # so we cache it first and release it after send finished. 
- self._cache_tensor(tensor) - self.direct_send(tensor, dest) - self._release_tensor(tensor) - - -class PPCommNCCL(PPCommBase): - - def __init__(self, global_mapping: Mapping): - super().__init__(global_mapping) - self.nccl_comm = torch.classes.trtllm.NcclCommunicatorOp( - self.mapping.world_size, - self.mapping.rank, - ) - - def direct_send(self, tensor: torch.Tensor, dest: int): - self.nccl_comm.send(tensor, dest) + self.nccl_comm.send(tensor, dest) def recv(self, tensor: torch.Tensor, src: Optional[int] = None): if src is None: @@ -842,10 +814,10 @@ class PPCommNCCL(PPCommBase): self.nccl_comm.recv(tensor, src) -class PPCommTorch(PPCommBase): +class PPCommTorch: def __init__(self, global_mapping: Mapping): - super().__init__(global_mapping) + self.mapping = global_mapping self.pg = self.mapping.pp_group_pg self.pg_group = self.mapping.pp_group @@ -853,21 +825,22 @@ class PPCommTorch(PPCommBase): assert global_rank in self.pg_group return self.pg_group.index(global_rank) - def direct_send(self, tensor: torch.Tensor, dest: int): - self.pg.send([tensor], self._global_to_local_rank(dest), tag=0).wait() - - # TODO: support async pp send for PPCommTorch def send(self, tensor: torch.Tensor, dest: Optional[int] = None): if dest is None: dest = self.mapping.next_pp_rank() - self.pg.send([tensor], self._global_to_local_rank(dest), tag=0).wait() + work = self.pg.send([tensor], self._global_to_local_rank(dest), tag=0) + # Send operation cannot be captured without blocking wait, + # so we block the current stream in CUDA graph cases. 
+ if torch.cuda.is_current_stream_capturing(): + work.block_current_stream() def recv(self, tensor: torch.Tensor, src: Optional[int] = None): if src is None: src = self.mapping.prev_pp_rank() - self.pg.recv([tensor], self._global_to_local_rank(src), tag=0).wait() + work = self.pg.recv([tensor], self._global_to_local_rank(src), tag=0) + work.block_current_stream() _pp_comm = None diff --git a/tensorrt_llm/mapping.py b/tensorrt_llm/mapping.py index e8f0648547..386d18da74 100644 --- a/tensorrt_llm/mapping.py +++ b/tensorrt_llm/mapping.py @@ -16,6 +16,7 @@ from enum import IntEnum from typing import List import torch +from torch.distributed import ProcessGroup from tensorrt_llm._torch.device_mesh import DeviceMeshTopologyImpl from tensorrt_llm._utils import mpi_disabled @@ -518,23 +519,23 @@ class Mapping(MappingBase): # DeviceMesh specific methods @property - def tp_group_pg(self): + def tp_group_pg(self) -> ProcessGroup: raise NotImplementedError("tp_group_pg is not implemented.") @property - def pp_group_pg(self): + def pp_group_pg(self) -> ProcessGroup: raise NotImplementedError("pp_group_pg is not implemented.") @property - def cp_group_pg(self): + def cp_group_pg(self) -> ProcessGroup: raise NotImplementedError("cp_group_pg is not implemented.") @property - def moe_tp_group_pg(self): + def moe_tp_group_pg(self) -> ProcessGroup: raise NotImplementedError("moe_tp_group_pg is not implemented.") @property - def moe_ep_group_pg(self): + def moe_ep_group_pg(self) -> ProcessGroup: raise NotImplementedError("moe_ep_group_pg is not implemented.") def build_mesh(self): From dda76583062727d6e936880bc85e3b4bf1c95c2f Mon Sep 17 00:00:00 2001 From: Void <18275976+yilin-void@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:22:56 +0800 Subject: [PATCH 141/172] [https://nvbugs/5655885][fix] fix invalid instruction error in 2shot ar kernel on Ampere (#9394) Signed-off-by: Yilin Zhang <18275976+yilin-void@users.noreply.github.com> --- 
.../kernels/communicationKernels/allReduceFusionKernels.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu index 2f6ac3fab7..25c662534d 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu @@ -137,11 +137,17 @@ public: // corresponding CTA has not been launched. for (int flag_idx = blockIdx.x; flag_idx < kBarrierFlagCount; flag_idx += gridDim.x) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) asm volatile( "st.global.relaxed.sys.b32 [%1], %0;" ::"r"(m_flag_value), "l"(m_target_flag + flag_idx * NRanks)); +#else + st_flag(m_target_flag + flag_idx * NRanks, m_flag_value); +#endif } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) // Single release fence asm volatile("fence.release.sys;"); +#endif while (ld_flag(m_current_flag) == prev_flag(m_flag_value)) { From 504ede707e5b5fc65c19704510ee8ffc3067752d Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:41:30 +0800 Subject: [PATCH 142/172] [None] [fix] Fix nsys_on argument for slurm scripts (#9995) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 5263dadc2f..881b08664a 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -261,7 +261,7 @@ def submit_job(config, log_dir): config['benchmark']['concurrency_list'], str(slurm_config['numa_bind']), log_dir, - str(profiling_config['nsys_on']), + str(profiling_config['nsys_on']).lower(), profiling_config['gen_profile_range'] if server_type 
== "GEN" else profiling_config['ctx_profile_range'], gen_config_path if server_type == "GEN" else ctx_config_path, From 3c98b2500584552bf46f5bc7b5a46163958fb7f7 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 15:14:24 +0800 Subject: [PATCH 143/172] [None][chore] Add failed cases into waives.txt (#9941) Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 93354a5f12..23262dedc1 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -442,3 +442,5 @@ unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_a unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[True] SKIP (https://nvbugs/5739981) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve[True] SKIP (https://nvbugs/5739981) unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py SKIP (https://nvbugs/5741060) +full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] SKIP (https://nvbugs/5596337) +full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5596337) From 825025b1370585020bc3927d60b4cd208deacda0 Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Mon, 15 Dec 2025 15:55:54 +0800 Subject: [PATCH 144/172] [None][infra] Add multi gpu Ray tests into L0 merge change request list. 
(#9996) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> --- jenkins/L0_MergeRequest.groovy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 60f48063ee..a8e5789589 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -738,6 +738,9 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars) "tests/unittest/llmapi/test_llm_multi_gpu.py", "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py", "tests/integration/defs/accuracy/test_disaggregated_serving.py", + "tests/unittest/_torch/ray_orchestrator/multi_gpu/", + "tests/integration/defs/examples/test_ray.py", + "tests/unittest/llmapi/test_async_llm.py", ] def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars) From 83885c69e78d2c074f61fdd1c4bb89e4c0031e9d Mon Sep 17 00:00:00 2001 From: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:52:29 +0100 Subject: [PATCH 145/172] [TRTLLM-9136][feat] 2D parallel EP TP support (#9459) Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com> --- .../advanced/expert_configurations.md | 19 + .../_torch/auto_deploy/transform/interface.py | 8 +- .../transform/library/collectives.py | 2 +- .../auto_deploy/transform/library/sharding.py | 2022 +++++++++++++++-- .../_torch/auto_deploy/utils/node_utils.py | 4 +- .../auto_deploy/utils/sharding_utils.py | 1840 --------------- .../_utils_test/_graph_test_helpers.py | 6 + .../multigpu/test_ad_allreduce_strategies.py | 25 +- .../library/test_bmm_sharding.py | 15 +- .../library/test_ep_sharding.py | 33 +- .../library/test_tp_sharding.py | 34 +- .../utils/test_quantization_utils.py | 2 +- 12 files changed, 1941 insertions(+), 2069 deletions(-) delete mode 100644 tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py diff --git a/docs/source/torch/auto_deploy/advanced/expert_configurations.md 
b/docs/source/torch/auto_deploy/advanced/expert_configurations.md index 4df92f0cf7..cf4c2c94dd 100644 --- a/docs/source/torch/auto_deploy/advanced/expert_configurations.md +++ b/docs/source/torch/auto_deploy/advanced/expert_configurations.md @@ -190,6 +190,25 @@ Specifies which sharding dimensions to apply during heuristic sharding. The avai You can enable multiple dimensions simultaneously. For example, `['tp', 'ep']` will apply both tensor parallelism and expert parallelism. +#### `process_grid` (dict, default: `None`) + +Specifies a 2D device mesh for hybrid EP+TP parallelism. + +- NOTE 1: This grid applies only to the MoE layers. Attention, Mamba, and MLP layers are unaffected. +- NOTE 2: The order of the keys matters. The process grid is laid out in generalized row-major order, + that is, the last dimension is stride-one. +- NOTE 3: `ep * tp` must be equal to the provided world size. Otherwise, the mesh will be considered invalid, + and 1D ep-only parallelism will be applied. + +Example: + +``` + process_grid: {'ep': 2, 'tp': 2} +``` + +If `world_size == 4`, ranks \[0,1\] and \[2,3\] will create two EP groups. Experts will be distributed across these two +groups, and internally, TP=2 column-row sharding will be applied. + #### `requires_shape_prop` (bool, default: `true`) Whether shape propagation is required before applying this transform. Shape propagation enables the transform to make informed decisions about sharding strategies based on tensor dimensions. 
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/interface.py b/tensorrt_llm/_torch/auto_deploy/transform/interface.py index 24b58f0a70..6c2c69c7f8 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/interface.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/interface.py @@ -24,7 +24,6 @@ from ..utils._graph import ( run_shape_prop, ) from ..utils.logger import ad_logger -from ..utils.sharding_utils import ShardingTransformContainer class TransformError(Exception): @@ -61,9 +60,10 @@ class Stages(Enum): class SharedConfig(BaseModel): """Global config shared between multiple transforms in the inference optimizer.""" - sharding_transform_container: ShardingTransformContainer = Field( - default_factory=ShardingTransformContainer - ) + model_config = { + # to provide an easy way to do config validation of child config classes with more fields + "extra": "allow", + } local_rank: int = Field(default=0) world_size: int = Field(default=1) diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py index 6eb5371f40..85dc6c48be 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py @@ -122,7 +122,7 @@ class FuseAllreduceResidualRMSNorm(BaseTransform): # ============================================================================ # Get the allreduce strategy from shared_config - strategy = shared_config.sharding_transform_container.allreduce_strategy.name + strategy = shared_config.sharding_transform_container.config.allreduce_strategy.name # TRT-LLM backend (MPI mode) - two patterns for different addition orders _allreduce_residual_rmsnorm_pattern_trtllm = _make_allreduce_residual_rmsnorm_pattern( diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py index 02f7e226c1..bae85f3a22 100644 --- 
a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py @@ -16,19 +16,27 @@ Our sharding algorithm for tensor parallelism (TP) is based on the following ste happens automatically via the checkpoint loading hook added in step 2c. """ +import math +import operator import re -from typing import Any, Dict, List, Tuple, Type, Union +from abc import ABC, abstractmethod +from enum import Enum, IntEnum +from functools import partial +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Type, Union import torch -from pydantic import Field, field_validator +import torch.nn as nn +from pydantic import BaseModel, Field, field_validator from torch.fx import GraphModule, Node from .....functional import AllReduceStrategy +from ...custom_ops.trtllm_dist import is_trtllm_op_available from ...models.factory import ModelFactory, ShardingConfigSource from ...shim.interface import CachedSequenceInterface from ...utils.logger import ad_logger from ...utils.node_utils import ( bfs, + extract_param_names_from_node, extract_weight_node, filtered_nodes, get_all_layer_subgraphs, @@ -37,22 +45,12 @@ from ...utils.node_utils import ( is_any_moe_op, is_any_ssm_op, is_op, + num_users_of_weight_node, subgraph, ) -from ...utils.sharding_utils import ( - BMMShardingInfo, - DistBackend, - EPShardingInfo, - LayerType, - ParameterUpdateInfo, - ShardingDim, - ShardingSource, - ShardingTransformContainer, - ShardingTransformInfo, - SplitDimension, - WeightShardingInfo, - get_all_weights_in_subgraph, - validate_allreduce_strategy, +from ...utils.quantization_utils import ( + cutlass_fp4_scale_to_modelopt_fp4_scale, + modelopt_fp4_scale_to_cutlass_fp4_scale, ) from ..interface import ( BaseTransform, @@ -63,6 +61,67 @@ from ..interface import ( ) +######################################################## +# Helper enums +######################################################## +class 
ShardingSource(Enum): + """Enum for sharding source.""" + + HEURISTIC = "heuristic" + FACTORY = "factory" + MANUAL = "manual" + + +class ShardingDim(Enum): + """Enum for sharding dimension.""" + + SSM = "ssm" + TP = "tp" + EP = "ep" + BMM = "bmm" + + +class SplitDimension(IntEnum): + """Enum for tensor split dimensions in sharding.""" + + # NOTE: The names COLUMN/ROW reflect the hugging face + # base_tp_plan sharding notation, but since we assume Y = W @ X^T, + # when splitting weight matrix W^T across columns, the actual split + # is over dimension 0 + COLUMN = 0 + ROW = 1 + + +class DistBackend(Enum): + """Enum for distributed backend.""" + + AUTO = "auto" + TRTLLM = "trtllm" + TORCH = "torch" + + +class LayerType(Enum): + """Enum for layer type.""" + + ATTENTION = "attention" + MAMBA = "mamba" + MLP = "mlp" + MOE = "moe" + + +class MLPType(Enum): + """Enum for MLP type.""" + + GATED_MLP = "gated_mlp" # explicit three weights: up, down, gate (in this order) + MLP = "mlp" # two weights: up, down + FUSED_GATED_MLP = ( + "fused_gated_mlp" # fused three weights (two inputs) up_gate, down (in this order) + ) + + +######################################################## +# Sharding classes +######################################################## class ShardingTransformConfig(TransformConfig): """Configuration for sharding the model.""" @@ -88,6 +147,63 @@ class ShardingTransformConfig(TransformConfig): "LOWPRECISION, UB, MNNVL, NCCL_SYMMETRIC", ) + process_grid: Dict[ShardingDim, int] = Field(default_factory=dict) + + def validate_config(self, sources: Union[ShardingSource, List[ShardingSource]] = None) -> bool: + init_process_grid_from_config(self) + if sources is None: + sources = [ShardingSource.FACTORY, ShardingSource.MANUAL] + if not isinstance(sources, list): + sources = [sources] + for source in sources: + config = self.manual_config if source == ShardingSource.MANUAL else self.factory_config + if ( + source == ShardingSource.FACTORY + and self.factory_source 
!= ShardingConfigSource.HUGGINGFACE + ): + if "source" in config: + self.factory_source = config["source"] + if self.factory_source != ShardingConfigSource.HUGGINGFACE: + ad_logger.debug( + "Sharding config is currently only supported for HuggingFace. Skipping." + ) + config.clear() + continue + + if "head_dim" not in config: + ad_logger.debug("Sharding config does not contain head_dim. Skipping.") + # invalidate the config + config.clear() + continue + + if "tp_plan" not in config or config["tp_plan"] is None or len(config["tp_plan"]) == 0: + ad_logger.debug("Sharding config does not contain tp_plan. Skipping.") + # invalidate the config + config.clear() + continue + + tp_plan = config["tp_plan"] + + values = set(tp_plan.values()) + supported_modes = { + "colwise", # row split and no collective + "rowwise", # column split and all-reduce + "mamba", # mamba SSM layer + "gather", # simple shard (row + all_gather) + # TODO: remaining values are not supported yet. + # They require hybrid EP+TP and/or SP support. + # "sequence_parallel", # sequence parallelism + # "local_colwise", + # "local_rowwise", + # "local_packed_rowwise", + # "local", + } + if not self.support_partial_config and not values.issubset(supported_modes): + ad_logger.debug("Sharding config contains invalid values. Skipping.") + # invalidate the config + config.clear() + continue + @field_validator("allreduce_strategy", mode="before") @classmethod def _validate_allreduce_strategy(cls, v): @@ -97,6 +213,578 @@ class ShardingTransformConfig(TransformConfig): dist_backend: DistBackend = Field(default=DistBackend.AUTO) +class ShardingTransformInfo(BaseModel, ABC): + """Abstract base class for transformation configurations.""" + + target_node: str + config: ShardingTransformConfig + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """ + Validate whether the transformation is valid. + Execute right before applying the transformation. 
+ """ + return True + + @abstractmethod + def apply(self, gm: GraphModule, node: Node) -> None: + """Apply the transformation to the graph module. + + This method must be implemented by each transformation class. + """ + pass + + def check_and_apply(self, gm: GraphModule, node: Node) -> bool: + """ + Check if the transformation is valid and apply it if it is. + Return True if the transformation is applied, False otherwise. + """ + if not self.validate(gm, node): + ad_logger.warning(f"Skipping invalid transformation {self}.") + return False + self.apply(gm, node) + return True + + def __hash__(self) -> int: + """Make the transform info hashable by excluding the config field. + + The config field is excluded because: + 1. It may not be hashable (ShardingTransformConfig is mutable) + 2. Tests set config=None before comparison anyway + """ + # Get all fields except 'config' for hashing + field_values = [] + for field_name, field_info in self.model_fields.items(): + if field_name != "config": + value = getattr(self, field_name) + # Handle enums + if isinstance(value, (Enum, IntEnum)): + field_values.append(value.value) + else: + field_values.append(value) + return hash(tuple(field_values)) + + +class WeightShardingInfo(ShardingTransformInfo): + """Configuration for TP sharding transformations.""" + + split_dim: SplitDimension + dist_op: Optional[Literal["all_reduce", "all_gather"]] = None + min_local_shape: int = 1 + layer_type: LayerType = LayerType.MLP + # used for TP sharding of fused weights + fused_weight_dims: Optional[list] = None + + def quantization_cb( + self, + gm: GraphModule, + submod: nn.Module, + node: Node, + weight_key: str, + weight_new_shape: torch.Size, + dim: int, + rank: int, + world_size: int, + ) -> None: + """Quantization callback. 
Default does nothing for non-quantized models.""" + return None + + @classmethod + def from_node(cls, node: Node, **kwargs) -> "WeightShardingInfo": + """ + Create the correct TPShardingInfo subclass (FP8/FP4/base) based on `node`. + """ + subcls = _resolve_tp_cls_from_node(node) + return subcls(target_node=node.name, **kwargs) + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """Validate the transformation configuration.""" + if self.dist_op is not None: + if self.split_dim == SplitDimension.COLUMN: + if self.dist_op == "all_reduce": + ad_logger.warning( + f"Column split is only supported for all_gather. Skipping {self}." + ) + return False + if self.split_dim == SplitDimension.ROW: + if self.dist_op == "all_gather": + ad_logger.warning( + f"Row split is only supported for all_reduce. Skipping {self}." + ) + return False + return True + + def apply(self, gm: GraphModule, node: Node) -> None: + """Apply TP sharding transformation to the graph module.""" + _shard_parameter_node( + gm=gm, + node=node, + dim=self.split_dim.value, + config=self.config, + add_dist=self.dist_op is not None, + min_local_shape=self.min_local_shape, + fused_weight_dims=self.fused_weight_dims, + quantization_cb=self.quantization_cb, + ) + + +class ParameterUpdateInfo(ShardingTransformInfo): + """Configuration for node args sharding transformations.""" + + args: tuple + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """Validate the transformation configuration.""" + return len(node.args) == len(self.args) + + def apply(self, gm: GraphModule, node: Node) -> None: + """Apply the transformation to the graph module.""" + _update_node_args(node, self.args) + + +class QuantizationShardingMixin(ABC): + """ + Mixin that provides a callback to handle quantization-aware sharding: + - shards/rewrites scale buffers + - registers the quantized shard load hook + """ + + @abstractmethod + def scale_names(self) -> List[str]: ... 
+ + def shard_scales( + self, + dim: int, + rank: int, + world_size: int, + weight_shape: torch.Size, + **scales: torch.Tensor, + ) -> Dict[str, torch.Tensor]: + return {k: v for k, v in scales.items() if isinstance(v, torch.Tensor)} + + def shard_load_hook( + self, + state_dict, + prefix, + *args, + weight_name: str, + weight_shape: torch.Size, + dim: int, + rank: int, + world_size: int, + ) -> None: + return + + def quantization_cb( + self, + gm: GraphModule, + submod: nn.Module, + node: Node, + weight_key: str, + weight_new_shape: torch.Size, + dim: int, + rank: int, + world_size: int, + ) -> None: + scales = {} + for scale_name in self.scale_names(): + scales[scale_name] = submod.get_buffer(scale_name) + scales["weight_shape"] = weight_new_shape + sharded_scales = self.shard_scales(dim, rank, world_size, **scales) + for k, v in sharded_scales.items(): + submod.register_buffer(k, v) + + gm._register_load_state_dict_pre_hook( + partial( + self.shard_load_hook, + weight_name=weight_key, + weight_shape=weight_new_shape, + dim=dim, + rank=rank, + world_size=world_size, + ) + ) + + +class FP8WeightShardingInfo(QuantizationShardingMixin, WeightShardingInfo): + """Tensor-parallel sharding for FP8-quantized linears.""" + + def scale_names(self) -> List[str]: + return ["input_scale", "weight_scale"] + + def shard_scales( + self, + dim: int, + rank: int, + world_size: int, + weight_shape: torch.Size, + *, + input_scale: torch.Tensor, + weight_scale: torch.Tensor, + ) -> Dict[str, torch.Tensor]: + return { + "input_scale": input_scale, + "weight_scale": weight_scale, + } + + def shard_load_hook( + self, + state_dict, + prefix, + *args, + weight_name: str, + weight_shape: torch.Size, + dim: int, + rank: int, + world_size: int, + ) -> None: + return + + +def _shard_fp4_weight_scale(weight_scale, sharded_uint8_weight_shape, dim, rank, world_size): + assert weight_scale.dim() == 1 + weight_shape_original = list(sharded_uint8_weight_shape) + weight_shape_original[dim] = 
weight_shape_original[dim] * world_size + weight_shape_original[-1] *= 2 + modelopt_weight_scale = cutlass_fp4_scale_to_modelopt_fp4_scale( + weight_scale, tuple(weight_shape_original) + ) + return modelopt_fp4_scale_to_cutlass_fp4_scale( + modelopt_weight_scale.tensor_split(world_size, dim=dim)[rank] + ) + + +class FP4WeightShardingInfo(QuantizationShardingMixin, WeightShardingInfo): + """Tensor-parallel sharding for FP4-quantized linears.""" + + def scale_names(self) -> List[str]: + return ["input_scale", "weight_scale", "alpha"] + + def shard_scales( + self, + dim: int, + rank: int, + world_size: int, + weight_shape: torch.Size, + *, + weight_scale: torch.Tensor, + alpha: torch.Tensor, + input_scale: torch.Tensor, + ) -> Dict[str, torch.Tensor]: + return { + "alpha": alpha, + "input_scale": input_scale, + "weight_scale": _shard_fp4_weight_scale( + weight_scale, weight_shape, dim, rank, world_size + ), + } + + def shard_load_hook( + self, + state_dict, + prefix, + *args, + weight_name: str, + weight_shape: torch.Size, + dim: int, + rank: int, + world_size: int, + ) -> None: + key = weight_name + "_scale" + if key in state_dict: + state_dict[key] = _shard_fp4_weight_scale( + state_dict[key], weight_shape, dim, rank, world_size + ) + + +class BMMShardingInfo(ShardingTransformInfo): + """Configuration for BMM sharding transformations.""" + + start_idx: int + end_idx: int + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """Validate the transformation configuration.""" + if not is_op(node, torch.ops.aten.bmm): + ad_logger.warning(f"BMM sharding is only supported for BMM nodes. 
Skipping {self}.") + return False + + # Get the input tensors + lhs_tensor = node.args[0] + rhs_tensor = node.args[1] + + # Check batch sizes from meta information + lhs_batch_size = lhs_tensor.meta["val"].shape[0] + rhs_batch_size = rhs_tensor.meta["val"].shape[0] + + assert lhs_batch_size == rhs_batch_size, "Batch sizes of both tensors must match" + bmm_batch_size = lhs_batch_size + + # Check if the distribution is balanced + remainder = bmm_batch_size % self.config.world_size + + # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather doesn't support uneven splits at the moment. + if remainder: + ad_logger.warning( + f"BMM batch size {bmm_batch_size} is not divisible by world size {self.config.world_size}. " + f"This will result in uneven distribution of work across devices. Skipping." + ) + return False + return True + + def apply(self, gm: GraphModule, node: Node) -> None: + """Apply BMM sharding transformation to the graph module.""" + + def handle_tensor( + bmm_node: Node, tensor_node: Node, arg_idx: int, start_idx: int, end_idx: int + ): + """Unified helper function to shard either a parameter tensor or a dynamic tensor. 
+ + Args: + bmm_node: The BMM node that is being processed + tensor_node: The input tensor node to shard + arg_idx: The argument index of the tensor in the BMM node + start_idx: Start index for sharding + end_idx: End index for sharding + """ + + # Define slice function for the sharding + def slice_tensor(t: torch.Tensor) -> torch.Tensor: + return t[start_idx:end_idx] + + if tensor_node.op == "get_attr": + # Handle parameter tensor + weight_key = tensor_node.target + modname, _, param_name = weight_key.rpartition(".") + param = gm.get_parameter(weight_key) + + # Update the parameter with its shard + param_new = nn.Parameter(slice_tensor(param).detach().clone(), requires_grad=True) + gm.get_submodule(modname).register_parameter(param_name, param_new) + + # Register load state dict hook + gm._register_load_state_dict_pre_hook( + partial( + _load_hook, + f_split=slice_tensor, + param_key=weight_key, + param_shape=param_new.shape, + ) + ) + else: + # Handle dynamic tensor + with gm.graph.inserting_before(bmm_node): + tensor_slice = gm.graph.call_function( + torch.ops.aten.slice.Tensor, args=(tensor_node, 0, start_idx, end_idx, 1) + ) + # Update BMM node to use the sliced tensor + bmm_node.update_arg(arg_idx, tensor_slice) + + # Get the input tensors + lhs_tensor = node.args[0] + rhs_tensor = node.args[1] + # Handle both tensors + handle_tensor(node, lhs_tensor, 0, self.start_idx, self.end_idx) + handle_tensor(node, rhs_tensor, 1, self.start_idx, self.end_idx) + + # Add all_gather node after BMM to collect results + with gm.graph.inserting_after(node): + gather_node = gm.graph.call_function( + torch.ops.auto_deploy.torch_dist_all_gather.default, + args=(node, 0), # Gather along batch dimension (0) + ) + node.replace_all_uses_with(gather_node) + gather_node.replace_input_with(gather_node, node) + + +class EPShardingInfo(ShardingTransformInfo): + """Configuration for EP sharding transformations.""" + + mlp_type: MLPType + + @classmethod + def from_node(cls, node: Node, 
**kwargs) -> "EPShardingInfo": + """ + Create the correct EPShardingInfo subclass (FP8/NVFP4/base) based on `node`. + """ + subcls = _resolve_ep_cls_from_node(node) + return subcls(target_node=node.name, **kwargs) + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """Validate the transformation configuration.""" + if not is_op(node, torch.ops.auto_deploy.torch_moe): + ad_logger.warning(f"EP sharding is only supported for MOE nodes. Skipping {self}.") + return False + return True + + def apply(self, gm: GraphModule, node: Node) -> None: + """Apply EP sharding transformation to the graph module.""" + _insert_sharded_moe(gm, node, self.config, mlp_type=self.mlp_type) + + +class MXFP4EPShardingInfo(EPShardingInfo): + """GPT-OSS style MXFP4-specific EP sharding behavior.""" + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + """Validate the transformation configuration.""" + if not is_op(node, torch.ops.auto_deploy.triton_mxfp4_moe): + ad_logger.warning(f"EP sharding is only supported for MOE nodes. Skipping {self}.") + return False + return True + + def apply(self, gm: GraphModule, node: Node) -> None: + _insert_sharded_mxfp4_mlp_ep(gm, node, self.config) + + +class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin): + """FP8-specific EP sharding behavior.""" + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + if not is_op(node, torch.ops.auto_deploy.torch_quant_fp8_moe): + ad_logger.warning(f"EP sharding is only supported for MOE nodes. 
Skipping {self}.") + return False + return True + + def scale_names(self) -> List[str]: + return ["input_scale", "weight_scale"] + + def apply(self, gm: GraphModule, node: Node) -> None: + _insert_sharded_moe( + gm, + node, + self.config, + self.mlp_type, + scale_names=self.scale_names(), + ) + + +class NVFP4EPShardingInfo(EPShardingInfo, QuantizationShardingMixin): + """NVFP4-specific EP sharding behavior.""" + + def validate(self, gm: GraphModule = None, node: Node = None) -> bool: + if not is_op(node, torch.ops.auto_deploy.torch_quant_nvfp4_moe): + ad_logger.warning(f"EP sharding is only supported for MOE nodes. Skipping {self}.") + return False + return True + + def scale_names(self) -> List[str]: + return ["input_scale", "weight_scale", "alpha"] + + def apply(self, gm: GraphModule, node: Node) -> None: + _insert_sharded_moe(gm, node, self.config, self.mlp_type, scale_names=self.scale_names()) + + +EP_SHARDING_RULES = [ + (lambda n: is_op(n, torch.ops.auto_deploy.torch_quant_fp8_moe), FP8EPShardingInfo), + (lambda n: is_op(n, torch.ops.auto_deploy.torch_quant_nvfp4_moe), NVFP4EPShardingInfo), + (lambda n: is_op(n, torch.ops.auto_deploy.torch_moe), EPShardingInfo), + (lambda n: is_op(n, torch.ops.auto_deploy.triton_mxfp4_moe), MXFP4EPShardingInfo), +] + + +def _resolve_ep_cls_from_node(node: Node) -> type[EPShardingInfo]: + for pred, cls in EP_SHARDING_RULES: + try: + if pred(node): + return cls + except Exception: + # Missing op variant in this build or other harmless issues — keep trying. + pass + return EPShardingInfo + + +######################################################## +# Transform API classes +######################################################## + + +@TransformRegistry.register("detect_sharding") +class Sharding(BaseTransform): + """A transformation to apply sharding to the model following tensor parallelism. + + The transformation is based on the following steps: + + 1. 
Identify boundary nodes between residual nodes to identify shardable regions. + 2. Identify the GEMM nodes that can be sharded + 3. Trace through the subgraph using DFS/BFS between each pair of boundary nodes + 4. Account for each node in the trace to ensure the op is correct even after sharding. This is + necessary to ensure that the sharding is correct and we need to be able to account for + **all** nodes in the subgraph. The subgraph here is defined as the region between the first + linear node to the last linear node of an identified sharding region. + # 5. Shard the GEMM nodes or skip accordingly. + + min_local_shape is the minimum size of the local tensor shard, to prevent TP parallelism + splitting, e.g., the individual heads into smaller shards. + """ + + config: ShardingTransformConfig + + @classmethod + def get_config_class(cls) -> Type[TransformConfig]: + return ShardingTransformConfig + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + local_rank, world_size = shared_config.local_rank, shared_config.world_size + assert isinstance(gm, GraphModule), "Expecting GraphModule" + config = self.config + config.factory_config = factory.get_sharding_config() if factory else {} + config.rank = local_rank + config.world_size = world_size + # validate the config + config.validate_config() + # initialize the transform container + transform_container = ShardingTransformContainer(config=config) + shared_config.sharding_transform_container = transform_container + ad_logger.info( + f"Using allreduce strategy: {config.allreduce_strategy.name}, dist backend: {config.dist_backend}" + ) + + if world_size < 2: + ad_logger.info("Skipping sharding for single device") + return gm, TransformInfo( + skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True + ) + + info = TransformInfo(skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True) + 
for source in config.sharding_source: + if source == ShardingSource.FACTORY: + if len(config.factory_config) == 0: + ad_logger.debug( + "No factory config found. Skipping sharding from factory config" + ) + continue + ad_logger.info("Applying sharding from factory config") + info += detect_sharding_from_config(gm, transform_container, ShardingSource.FACTORY) + elif source == ShardingSource.MANUAL: + if len(config.manual_config) == 0: + ad_logger.debug("No manual config found. Skipping sharding from manual config") + continue + ad_logger.info("Applying sharding from manual config") + info += detect_sharding_from_config(gm, transform_container, ShardingSource.MANUAL) + + elif source == ShardingSource.HEURISTIC: + ad_logger.info(f"Running autodeploy sharding heuristics: {config.sharding_dims}") + # run TP sharding across ranks + if ShardingDim.TP in config.sharding_dims: + info += detect_column_row_shard(gm, transform_container) + + # run EP sharding across ranks + if ShardingDim.EP in config.sharding_dims: + info += detect_ep_shard(gm, transform_container) + + # run BMM sharding across ranks + if ShardingDim.BMM in config.sharding_dims: + info += detect_dp_bmm_shard(gm, transform_container) + + return gm, info + + @TransformRegistry.register("sharding_transform_executor") class ShardingTransformExecutor(BaseTransform): """Apply transformations to the graph module. 
@@ -156,17 +844,1092 @@ class ShardingTransformExecutor(BaseTransform): return gm, info -def _process_simple_shard( - nodes_linear: Union[Dict[Node, List[Node]], List[Node]], +class ShardingTransformContainer(BaseModel): + """Configuration for sharding the model.""" + + config: ShardingTransformConfig = Field(default_factory=ShardingTransformConfig) + weight_sharding_transforms: List[WeightShardingInfo] = Field(default_factory=list) + parameter_update_transforms: List[ParameterUpdateInfo] = Field(default_factory=list) + bmm_transforms: List[BMMShardingInfo] = Field(default_factory=list) + ep_transforms: List[EPShardingInfo] = Field(default_factory=list) + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._transform_list_dict = { + WeightShardingInfo: self.weight_sharding_transforms, + BMMShardingInfo: self.bmm_transforms, + EPShardingInfo: self.ep_transforms, + ParameterUpdateInfo: self.parameter_update_transforms, + } + + def add(self, transform: ShardingTransformInfo) -> bool: + """Append a transform only if that node was + not sharded before. Do not overwrite existing transforms. 
+ """ + # Find the appropriate list by checking inheritance + transform_list = None + for base_class, transform_list_candidate in self._transform_list_dict.items(): + if isinstance(transform, base_class): + transform_list = transform_list_candidate + break + + if transform_list is None: + raise ValueError(f"Unknown transform type: {type(transform)}") + + # Check if node already has a transform + for existing_transform in transform_list: + if existing_transform.target_node == transform.target_node: + return False + transform_list.append(transform) + return True + + +######################################################## +# Helper functions +######################################################## + + +def _load_hook( + state_dict, + prefix, + *args, + f_split: Callable[[torch.Tensor, int], torch.Tensor], + param_key: str, + param_shape: torch.Size, +): + # TODO: we need to support loading either a sharded or unsharded checkpoint. + # Otherwise, basic workflows like + # model.load_state_dict(model.state_dict()) will fail. + # This is quite a hacky solution. A better solution would be to store extra_state in + # the state_dict to identify whether the state_dict is sharded or not. + key = prefix + param_key + ad_logger.debug(f"Sharder LOAD hook is called for '{key}'") + if key not in state_dict: + return + p_to_load = state_dict[key] + + p_to_load = p_to_load if param_shape == p_to_load.shape else f_split(p_to_load) + + state_dict[key] = p_to_load + + +def _load_hook_remove( + state_dict: Dict, + prefix: str, + *args, + param_key: str, +): + key = prefix + param_key + ad_logger.debug(f"Sharder LOAD hook is called for '{key}'") + state_dict.pop(key, None) + + +def validate_allreduce_strategy(v): + """Convert string names like 'AUTO' to AllReduceStrategy enum. + + This is a shared validator for allreduce_strategy fields across all config classes. 
+ + Args: + v: Value to validate - can be AllReduceStrategy enum, string name, or integer value + + Returns: + AllReduceStrategy enum value + + Raises: + ValueError: If the input is an invalid strategy string + """ + if isinstance(v, AllReduceStrategy): + return v + if isinstance(v, str): + # Try to get enum by name + try: + return AllReduceStrategy[v] + except KeyError: + raise ValueError( + f"Invalid allreduce strategy: {v}. " + f"Valid options: {', '.join(s.name for s in AllReduceStrategy)}" + ) + if isinstance(v, int): + return AllReduceStrategy(v) + return v # Let Pydantic handle other types + + +def _get_dist_ops(backend: str): + """Get the appropriate distributed ops based on backend availability. + + Args: + backend: The distributed backend to use. Can be 'auto', 'trtllm', or 'torch'. + 'auto' will automatically select based on availability. + + Returns tuple of (all_gather_op, all_reduce_op) for the current backend. + """ + # Handle DistBackend enum or string + if hasattr(backend, "value"): + backend = backend.value + + if backend == "trtllm": + # Force TRT-LLM ops + return ( + torch.ops.auto_deploy.trtllm_dist_all_gather.default, + torch.ops.auto_deploy.trtllm_dist_all_reduce.default, + ) + elif backend == "torch": + # Force PyTorch distributed ops + return ( + torch.ops.auto_deploy.torch_dist_all_gather.default, + torch.ops.auto_deploy.torch_dist_all_reduce.default, + ) + else: # auto + # Automatically select based on availability + if is_trtllm_op_available(): + # Use TRT-LLM optimized ops in MPI mode + return ( + torch.ops.auto_deploy.trtllm_dist_all_gather.default, + torch.ops.auto_deploy.trtllm_dist_all_reduce.default, + ) + else: + # Use PyTorch distributed ops in demollm mode + return ( + torch.ops.auto_deploy.torch_dist_all_gather.default, + torch.ops.auto_deploy.torch_dist_all_reduce.default, + ) + + +def _validate_sharded_shapes( + node: Node, fused_weight_dims: Optional[list] = None, world_size: Optional[int] = None +) -> None: + """ + Update 
the shapes of the view nodes and the split node parameters to account for the TP sharding. + 1. After sharding weights of the linear node using column split + in attention module (Q, K, V), + the output Y = X @ W^T shape is [batch, seq, num_heads // TP_size, head_dim]. + Some models hardcode the shape of the output to [batch, seq, num_heads, head_dim] + instead of implicit [batch, seq, -1, head_dim]. + Detect such cases and update the shape of the view node accordingly. + 2. If the weights are fused (e.g,. QKV, gate_up, SSM, etc.), the follow-up split node parameters + need to be updated to account for the TP sharding. + """ + + # get the subgraph of this module. Subgraph boundary is the next linear node. + next_lin_node, _ = bfs(node, is_any_lin_op, include_root=False) + nodes_to_validate = subgraph( + [node], + include=lambda n: is_op(n, [torch.ops.aten.view, torch.ops.aten.reshape]), + boundary_condition=is_any_lin_op, + ) + for view_node in nodes_to_validate: + if len(view_node.args) < 2: + continue + if "sharded" in view_node.meta and view_node.meta["sharded"]: + continue + view_shape = list(view_node.args[1]) + if not isinstance(view_shape, list): + continue + if len(view_shape) >= 3 and isinstance(view_shape[2], int) and view_shape[2] != -1: + args = list(view_node.args) + view_shape[2] = -1 # view_shape[2] // world_size + args[1] = tuple(view_shape) + view_node.args = tuple(args) + view_node.meta["sharded"] = True + ad_logger.debug(f"\nUpdated view node {view_node} arguments to {view_node.args}") + + # if fused_weight_dims is provided, we need to update all split sizes + if fused_weight_dims is not None: + assert world_size is not None, "World size is required to update the split node params" + assert len(node.users) == 1, "Fused linear node should have only one user: a split node" + # find all split nodes in the region between this linear node and the next + split_nodes = subgraph( + [node], + [next_lin_node], + include=lambda n: is_op(n, 
[torch.ops.aten.split, torch.ops.aten.split_with_sizes]), + ) + for split_node in split_nodes: + orig_sizes = split_node.args[1] + new_sizes = [orig_sizes[i] // world_size for i in range(len(orig_sizes))] + args = list(split_node.args) + args[1] = new_sizes + split_node.args = tuple(args) + ad_logger.debug(f"\nUpdated split node {split_node} arguments to {split_node.args}") + + +TP_SHARDING_RULES = [ + (lambda n: is_op(n, torch.ops.auto_deploy.torch_fake_quant_fp8_linear), FP8WeightShardingInfo), + ( + lambda n: is_op(n, torch.ops.auto_deploy.torch_fake_quant_nvfp4_linear), + FP4WeightShardingInfo, + ), +] + + +def _resolve_tp_cls_from_node(node: Node): + for pred, cls in TP_SHARDING_RULES: + try: + if pred(node): + return cls + except Exception: + pass + return WeightShardingInfo + + +def _transform_bmm_moe_weight_param( + gm: GraphModule, + param_node: Node, + lo: int, + hi: int, + swap_gate_up: bool = False, +) -> None: + """Transform a parameter for BMM MoE: slice experts, optionally swap gate/up, transpose. + This modifies the parameter in-place and registers a load hook. + Does NOT create graph nodes - those should be created separately by the caller. 
+ Args: + gm: Graph module + param_node: The get_attr node for the parameter + lo: Start index for expert slicing + hi: End index for expert slicing + swap_gate_up: If True, swap W1 and W3 (Llama4 -> TRT-LLM format) + """ + if param_node.op != "get_attr": + return # Only works on parameters + + param_key = str(param_node.target) + modname, _, param_name = param_key.rpartition(".") + submod = gm.get_submodule(modname) if modname else gm + full_param = getattr(submod, param_name) + + # Slice the parameter along expert dimension (dim 0) + sliced_param = full_param[lo:hi].detach().clone() + + # Swap W1 and W3 if needed (for gate_up weights) + # Llama4: (E, H, 2*I) with [W1, W3], TRT-LLM wants [W3, W1] + if swap_gate_up and sliced_param.ndim == 3: + intermediate_size = sliced_param.shape[2] // 2 + w1 = sliced_param[:, :, :intermediate_size] + w3 = sliced_param[:, :, intermediate_size:] + sliced_param = torch.cat([w3, w1], dim=2) + + # Transpose: Llama4 (E, H, X) -> TRT-LLM (E, X, H) + transposed_param = sliced_param.transpose(1, 2) + transposed_shape = transposed_param.shape + + # Define transformation function for load hook + def transform_tensor(t: torch.Tensor) -> torch.Tensor: + t_sliced = t[lo:hi] + if swap_gate_up and t_sliced.ndim == 3: + intermediate_size = t_sliced.shape[2] // 2 + w1 = t_sliced[:, :, :intermediate_size] + w3 = t_sliced[:, :, intermediate_size:] + t_sliced = torch.cat([w3, w1], dim=2) + return t_sliced.transpose(1, 2).contiguous() + + # Register load hook + gm._register_load_state_dict_pre_hook( + partial( + _load_hook, + f_split=transform_tensor, + param_key=param_key, + param_shape=transposed_shape, + ) + ) + + # Replace the parameter with the transformed version + new_param = nn.Parameter(transposed_param, requires_grad=False) + setattr(submod, param_name, new_param) + + +def _get_dim0_from_arg(gm: GraphModule, arg: Union[Node, torch.Tensor]) -> int: + """Helper to get the first dimension size of an argument (Node or Tensor).""" + if 
isinstance(arg, torch.Tensor): + return arg.shape[0] + if isinstance(arg, Node): + if arg.op == "get_attr": + # Traverse attributes to find the tensor + obj = gm + for atom in arg.target.split("."): + obj = getattr(obj, atom) + return obj.shape[0] + if "val" in arg.meta: + return arg.meta["val"].shape[0] + raise ValueError(f"Cannot determine shape[0] for {arg}") + + +def get_all_weights_in_subgraph( + sources: list[Node], + sinks: list[Node], +): + """Get all weight nodes (get_attr nodes) in the subgraph between sources and sinks.""" + weight_nodes = subgraph(sources, sinks, include=lambda n: n.op == "get_attr") + return weight_nodes + + +def init_process_grid_from_config( + config: ShardingTransformConfig, +) -> Dict[ShardingDim, Dict[str, int]]: + rank, world_size = config.rank, config.world_size + if len(config.process_grid) > 0: + ad_logger.debug(f"EP + TP sharding process grid: {config.process_grid}") + ep_size = config.process_grid[ShardingDim.EP] + tp_size = config.process_grid[ShardingDim.TP] + # the order of the keys (ep,tp) vs (tp,ep) determines how ranks + # are mapped to the 2D process grid + if list(config.process_grid.keys())[-1] == ShardingDim.TP: + tp_rank = rank % tp_size + ep_rank = rank // tp_size + else: + tp_rank = rank // ep_size + ep_rank = rank % ep_size + + if ep_size * tp_size != world_size: + ad_logger.warning( + f"EP + TP sharding process grid {config.process_grid} " + f"does not match world size {world_size}. " + f"Skipping 2D sharding, applying only 1D EP sharding." + ) + ep_size = world_size + tp_size = 1 + ep_rank = rank + tp_rank = 0 + else: + ep_size = world_size + tp_size = 1 + ep_rank = rank + tp_rank = 0 + process_grid = { + ShardingDim.EP: {"p": ep_rank, "w": ep_size}, + ShardingDim.TP: {"p": tp_rank, "w": tp_size}, + } + config.process_grid = process_grid + return process_grid + + +def _canonicalize_node_args(node: Node) -> list: + """ + Canonicalize the node's arguments. 
+ Actions performed: + - Flatten list arguments + """ + new_args = list(node.args) + for i in range(len(new_args)): + # In FX graphs, the list might be a Node representing a list() call + if isinstance(new_args[i], Node): + # Check if this is a list() call node + if new_args[i].target is list and len(new_args[i].args) == 1: + new_args[i] = new_args[i].args[0] + if isinstance(new_args[i], (list, tuple)): + if len(new_args[i]) == 1: + new_args[i] = new_args[i][0] + + return new_args + + +######################################################## +# Sharding transform functions +######################################################## +def shard_weight_tensor( + gm: GraphModule, + weight_tensor: torch.Tensor, + param_key: str, + dim: int, rank: int, world_size: int, + min_local_shape: int = 1, + fused_weight_dims: Optional[list] = None, + requires_grad: bool = False, + custom_shard_fn: Optional[Callable[[torch.Tensor], torch.Tensor]] = None, +) -> Tuple[torch.Tensor, torch.Size]: + """Shard a weight tensor across ranks and register load hook. 
+ + Args: + gm: GraphModule containing the weight + weight_tensor: The weight tensor to shard + param_key: Parameter key for registering load hook + dim: Dimension to shard along + rank: Current rank + world_size: Total number of ranks + min_local_shape: Minimum local shape constraint (for GQA) + fused_weight_dims: List of dimensions for fused weights + custom_shard_fn: Optional custom function to shard the tensor + requires_grad: Whether the parameter should require gradients + + Returns: + Tuple of (sharded_tensor, sharded_shape) + """ + + def split_tensor( + t: torch.Tensor, + d: int = dim, + r: int = rank, + ws: int = world_size, + min_d_shape: int = min_local_shape, + ) -> torch.Tensor: + # The local tensor shape has to be divisible by min_d_shape + max_split_size = t.shape[d] // min_d_shape + if ws > max_split_size: + num_groups = math.ceil(ws / max_split_size) + ad_logger.debug( + f"World size {ws} is greater than the max split size {max_split_size}. " + + f"Splitting tensor to {num_groups} chunks" + ) + return torch.tensor_split(t, max_split_size, dim=d)[r // num_groups] + return torch.tensor_split(t, ws, dim=d)[r] + + # Handle fused weights + if fused_weight_dims is not None: + + def split_fused_tensor( + t: torch.Tensor, + fused_dims: list = fused_weight_dims, + d: int = dim, + ) -> torch.Tensor: + # dim_d = t.shape[d] + # num_parts = 1 + # part_size = dim_d // num_parts + # fused_dims = [part_size] * num_parts + return torch.cat( + [split_tensor(w) for w in torch.split(t, fused_dims, dim=d)], + dim=d, + ) + + f_split = split_fused_tensor + else: + f_split = split_tensor + + sharded_weight = f_split(weight_tensor) + sharded_shape = sharded_weight.shape + + # Register load hook + gm._register_load_state_dict_pre_hook( + partial( + _load_hook, + f_split=f_split, + param_key=param_key, + param_shape=sharded_shape, + ) + ) + + # Update the parameter in the module + modname, _, param_name = param_key.rpartition(".") + submod = gm.get_submodule(modname) + 
param_new = nn.Parameter(sharded_weight.detach().clone(), requires_grad=requires_grad) + setattr(submod, param_name, param_new) + + return sharded_weight, sharded_shape + + +def _shard_parameter_node( + gm: GraphModule, + node: Node, + dim: int, + config: ShardingTransformConfig, + add_dist: bool = False, + min_local_shape: int = 1, + fused_weight_dims: Optional[list] = None, + quantization_cb: Optional[ + Callable[[GraphModule, nn.Module, Node, str, torch.Size, int, int, int], None] + ] = None, +) -> None: + """Replace the node with parametrized weight tensor with a new node that accepts sharded weights. + + The state_dict is also updated to contain the sharded weights. + """ + assert dim in [0, 1], "Only dim 0 and 1 are supported for sharding" + assert add_dist or dim == 0, "For dim=1 sharding, dist_op is required." + + rank, world_size = config.rank, config.world_size + allreduce_strategy = config.allreduce_strategy.name + num_users = num_users_of_weight_node(node) + if num_users > 1 or num_users == 0: + ad_logger.warning( + f"Weight node {node} has {num_users} users. This is not supported for sharding. Skipping." 
+ ) + return + # get weight and bias key + weight_key, bias_key = extract_param_names_from_node(node) + + modname = weight_key.rpartition(".")[0] + submod = gm.get_submodule(modname) + + # Shard weight using the unified function (also updates the parameter) + original_weight = gm.get_parameter(weight_key) + _, weight_new_shape = shard_weight_tensor( + gm=gm, + weight_tensor=original_weight, + param_key=weight_key, + dim=dim, + rank=rank, + world_size=world_size, + min_local_shape=min_local_shape, + fused_weight_dims=fused_weight_dims, + ) + + if bias_key is not None and dim == 0: + # update bias for dim 0 --> we can handle it like the weight + original_bias = gm.get_parameter(bias_key) + shard_weight_tensor( + gm=gm, + weight_tensor=original_bias, + param_key=bias_key, + dim=dim, + rank=rank, + world_size=world_size, + min_local_shape=min_local_shape, + fused_weight_dims=fused_weight_dims, + ) + elif bias_key is not None and rank != world_size - 1: + # update the bias for dim 1 --> in this case only the last rank gets the bias to avoid + # double counting it. For all other we will delete the bias. 
+ args = list(node.args) + node_bias = args[2] + args[2] = None + node.args = tuple(args) + gm.graph.erase_node(node_bias) + bias_param_name = bias_key.rpartition(".")[-1] + setattr(submod, bias_param_name, None) + gm._register_load_state_dict_pre_hook(partial(_load_hook_remove, param_key=bias_key)) + + if quantization_cb is not None: + quantization_cb( + gm=gm, + submod=submod, + node=node, + weight_key=weight_key, + weight_new_shape=weight_new_shape, + dim=dim, + rank=rank, + world_size=world_size, + ) + + # # # column shard with no gather: the output is sharded + if not add_dist: + return + + # figure out the right dist op (backend-aware) + all_gather_op, all_reduce_op = _get_dist_ops(config.dist_backend) + dist_lookup = { + 0: (all_gather_op, -1), + 1: (all_reduce_op, allreduce_strategy), + } + fn_dist, *dist_args = dist_lookup[dim] + + # add reduction node + with gm.graph.inserting_after(node): + dist_node = gm.graph.call_function(fn_dist, args=(node,) + tuple(dist_args)) + node.replace_all_uses_with(dist_node) + dist_node.replace_input_with(dist_node, node) + + +def _update_node_args(node: Node, args: tuple) -> None: + """Update the node's arguments with the new sharded arguments.""" + if "sharded" in node.meta and node.meta["sharded"]: + return + node.args = args + node.meta["sharded"] = True + ad_logger.debug( + f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." 
+ ) + + +def _insert_sharded_mamba( + gm: GraphModule, + entry_node: Node, + dim: int, + config: ShardingTransformConfig, + min_local_shape: int, + weights_to_shard: Optional[list[str]] = None, + weight_shard_dims: Optional[Dict[str, int]] = None, + fused_weight_dims: Optional[Dict[str, list]] = None, + quantization_cb: Optional[ + Callable[[GraphModule, nn.Module, Node, str, torch.Size, int, int, int], None] + ] = None, +) -> bool: + """ + To shard Mamba layer, first column-shard the first linear layer: entry_node, + then shard all remaining weight tensors found in the subgraph defined between + entry_node and the next successor linear node. + First, validate if this is indeed a mamba module: within the subgraph, + there should be an torch_ssm node and conv1d node. + + Args: + gm: GraphModule + entry_node: The first linear node of the Mamba layer + dim: Default shard dimension + allreduce_strategy: AllReduceStrategy + min_local_shape: Minimum local shape constraint + weights_to_shard: Optional list of regex patterns to match weight names + weight_shard_dims: Optional dict mapping weight keys to their shard dimensions + fused_weight_dims: Optional dict mapping weight keys to their fused dimension lists + quantization_cb: Optional quantization callback + """ + # Find next linear node to define subgraph boundary + try: + next_lin_node, depth = bfs(entry_node, is_any_lin_op, include_root=False) + except RuntimeError: + ad_logger.warning("Could not find next linear node after entry_node for Mamba sharding") + return False + + rank, world_size = config.rank, config.world_size + # Get subgraph between entry_node and next linear node + subgraph_nodes = subgraph([entry_node], [next_lin_node]) + + ############################################################## + ########## validate if this is a valid Mamba module ########## + ############################################################## + # has_ssm = any(is_op(n, torch.ops.auto_deploy.mamba.torch_ssm_transform) for n in 
subgraph_nodes) + has_ssm = True + conv1d_nodes = [ + n + for n in subgraph_nodes + if is_op(n, [torch.ops.aten.conv1d, torch.ops.auto_deploy.torch_causal_conv1d]) + ] + if len(conv1d_nodes) != 1 or not has_ssm: + ad_logger.warning( + f"Subgraph does not contain exactly one conv1d node and torch_ssm_transform. " + f"Skipping Mamba sharding. conv1d_nodes={conv1d_nodes}, has_ssm={has_ssm}" + ) + return False + + ############################################################## + ########## infer split sizes for in_proj and conv1d ########## + ############################################################## + # in_proj and conv1d are most likely fused, followed up by split nodes. Infer split sizes: + if fused_weight_dims is None: + split_nodes = [ + n + for n in subgraph_nodes + if is_op(n, [torch.ops.aten.split, torch.ops.aten.split_with_sizes]) + ] + if len(split_nodes) != 2: + ad_logger.warning( + f"Subgraph does not contain exactly two split nodes. " + f"Skipping Mamba sharding. split_nodes={split_nodes}" + ) + return False + split_sizes_1 = split_nodes[0].args[1] + split_sizes_2 = split_nodes[1].args[1] + if split_sizes_1[1] != sum(split_sizes_2): + ad_logger.warning( + f"Split nodes have different sizes. " + f"Skipping Mamba sharding. split_sizes_1={split_sizes_1}, split_sizes_2={split_sizes_2}" + ) + return False + fused_weight_dims = { + "in_proj": split_sizes_1[0:1] + split_sizes_2 + split_sizes_1[2:], + "conv1d": split_sizes_2, + } + + conv1d_node = conv1d_nodes[0] + # conv1d_node last argument is the number of output channels. 
+ # This one is also sharded, so we need to update this parameter + conv_args = list(conv1d_node.args) + conv_args[-1] = conv1d_node.args[-1] // world_size + conv1d_node.args = tuple(conv_args) + + # First, shard the entry_node (the first linear layer) + # Extract entry node's fused_weight_dims by matching weight name against patterns + entry_fused_dims = None + if fused_weight_dims: + entry_weight_key, _ = extract_param_names_from_node(entry_node) + for pattern, dims in fused_weight_dims.items(): + if re.search(pattern, entry_weight_key): + entry_fused_dims = dims + break + + _shard_parameter_node( + gm=gm, + node=entry_node, + dim=SplitDimension.COLUMN, + config=config, + add_dist=False, + min_local_shape=min_local_shape, + fused_weight_dims=entry_fused_dims, + quantization_cb=quantization_cb, + ) + + # Get all weight nodes in the subgraph except for out_proj + weight_nodes = [ + n + for n in get_all_weights_in_subgraph([entry_node], [next_lin_node]) + if "out_proj" not in str(n) + ] + + # Shard remaining weights, such as conv1d or RMSNorm + for weight_node in weight_nodes: + weight_key = weight_node.target + + # Filter by regex patterns if provided + if weights_to_shard is not None: + if not any(pattern in weight_key for pattern in weights_to_shard): + continue + + # Determine shard dimension for this weight + shard_dim = weight_shard_dims.get(weight_key, dim) if weight_shard_dims else dim + + # Get the weight parameter + try: + weight_param = gm.get_parameter(weight_key) + except AttributeError: + ad_logger.debug(f"Could not get parameter for {weight_key}, skipping") + continue + + # Get fused dims for this weight if specified + fused_dims = None + for k, v in fused_weight_dims.items(): + if k in weight_key: + fused_dims = v + break + + # Shard the weight tensor (also updates the parameter in the module) + _, sharded_shape = shard_weight_tensor( + gm=gm, + weight_tensor=weight_param, + param_key=weight_key, + dim=shard_dim, + rank=rank, + world_size=world_size, 
+ min_local_shape=min_local_shape, + fused_weight_dims=fused_dims, + ) + + ad_logger.debug( + f"Sharded weight {weight_key} on dim {shard_dim}: " + f"{weight_param.shape} -> {sharded_shape}" + ) + + +def _insert_sharded_moe_stacked( + gm: GraphModule, + node: Node, + rank: int, + world_size: int, + allreduce_strategy: AllReduceStrategy, + scale_names: Sequence[str] = (), +): + """Update the torch_moe node with sliced stacked weight tensors, + sharded `selected_experts` and `final_scales(router_logics)`. + Add an all_reduce node after the moe node. + + For torch_moe with stacked tensor format (single-element lists containing 3D tensors). + + NOTE: allreduce_strategy is MANDATORY and must be explicitly provided. + """ + if allreduce_strategy is None: + raise ValueError(f"allreduce_strategy must be set for MoE sharding on node {node.name}") + + # Extract the stacked tensors from single-element lists + # args[3] = w1_weight (Node representing list with one 3D tensor, or direct list) + # args[4] = w2_weight (Node representing list with one 3D tensor, or direct list) + + # Helper to extract tensor node from list (handles both Node and direct list) + def extract_tensor_from_list_arg(list_arg): + if isinstance(list_arg, Node) and list_arg.target is list: + # It's a list() call node - extract from its args + return list_arg.args[0][0] # args[0] is the list content, [0] is first element + elif isinstance(list_arg, (list, tuple)): + # Direct list + return list_arg[0] + else: + raise ValueError(f"Unexpected list format: {type(list_arg)}") + + w3_w1_tensor_node = extract_tensor_from_list_arg(node.args[3]) + w2_tensor_node = extract_tensor_from_list_arg(node.args[4]) + num_experts = _get_dim0_from_arg(gm, w3_w1_tensor_node) + + args = list(node.args) + + # -- Handle selected_experts and final_scales sharding -- + selected_experts = args[1] + final_scales = args[2] + + experts_per_rank = num_experts // world_size + + with gm.graph.inserting_before(node): + lower = 
experts_per_rank * rank + # selected_experts_local = selected_experts - low + selected_experts_local = gm.graph.create_node( + "call_function", operator.sub, args=(selected_experts, lower), kwargs={} + ) + + # For num_experts % world_size != 0 case, + # assign the last (num_experts % world_size) experts to the last rank + div_node = gm.graph.create_node( + "call_function", operator.floordiv, args=(selected_experts, experts_per_rank), kwargs={} + ) + + comp_op = torch.ge if rank == world_size - 1 else torch.eq + rank_mask = gm.graph.create_node("call_function", comp_op, args=(div_node, rank), kwargs={}) + + # final_scales_local = final_scales * rank_mask + final_scales_local = gm.graph.create_node( + "call_function", operator.mul, args=(final_scales, rank_mask), kwargs={} + ) + + # -- Transform expert weight parameters -- + local_lo, local_hi = _split_range_last_remainder(num_experts, world_size, rank) + + # Transform w3_w1_stacked: slice experts, swap [W1,W3]->[W3,W1], transpose (E,H,2I)->(E,2I,H) + if isinstance(w3_w1_tensor_node, Node): + _transform_bmm_moe_weight_param( + gm, w3_w1_tensor_node, local_lo, local_hi, swap_gate_up=True + ) + + # Transform w2_stacked: slice experts, transpose (E,I,H)->(E,H,I) + if isinstance(w2_tensor_node, Node): + _transform_bmm_moe_weight_param(gm, w2_tensor_node, local_lo, local_hi, swap_gate_up=False) + + # -- Update args (keep same lists/nodes, just with transformed parameters) -- + args[1] = selected_experts_local + args[2] = final_scales_local + # args[3] and args[4] stay the same - we modified the parameters in-place + + ad_logger.debug( + f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." 
+ ) + + node.args = tuple(args) + + # -- add an all_reduce node -- + with gm.graph.inserting_after(node): + dist_node = gm.graph.call_function( + torch.ops.auto_deploy.torch_dist_all_reduce.default, + args=(node, allreduce_strategy), + ) + node.replace_all_uses_with(dist_node) + dist_node.replace_input_with(dist_node, node) + + +def _insert_sharded_moe( + gm: GraphModule, + node: Node, + config: ShardingTransformConfig, + mlp_type: MLPType, + scale_names: Sequence[str] = (), +): + """Update the torch_moe node with sharded weight lists or stacked tensors, + sharded `selected_experts` and `final_scales(router_logics)`. + Add an all_reduce node after the moe node. + + Handles both: + - Standard format: per-expert weight lists + - Stacked format: single-element lists containing stacked 3D tensors (Llama4 pattern) + + NOTE: allreduce_strategy is MANDATORY. + """ + # get 2D EP+TP process grid and corresponding ranks + ep_rank = config.process_grid[ShardingDim.EP]["p"] + ep_size = config.process_grid[ShardingDim.EP]["w"] + tp_rank = config.process_grid[ShardingDim.TP]["p"] + tp_size = config.process_grid[ShardingDim.TP]["w"] + allreduce_strategy = config.allreduce_strategy.name + args = list(node.args) + if allreduce_strategy is None: + raise ValueError(f"allreduce_strategy must be set for MoE sharding on node {node.name}") + scale_names = list(scale_names) + + flat_args = _canonicalize_node_args(node) + # we have two variants of MoE: stacked and listed: + # - stacked: w1, w2, w3 weight args are order-3 tensors, where the 1st dimension corresponds + # to the stacked expert weigthts. + # - listed: w1, w2, w3 weight args are lists of order-2 tensors, where each expert weight + # is a separate entry in the list. 
+ if isinstance(flat_args[3], Node): + is_stacked = True + num_experts = flat_args[3].meta["val"].shape[0] + else: + is_stacked = False + num_experts = len(flat_args[3]) + args = list(node.args) + + # -- Handle selected_experts and final_scales sharding -- + selected_experts = args[1] + final_scales = args[2] + + experts_per_rank = num_experts // ep_size + + with gm.graph.inserting_before(node): + lower = experts_per_rank * ep_rank + # selected_experts_local = selected_experts - low + selected_experts_local = gm.graph.create_node( + "call_function", operator.sub, args=(selected_experts, lower), kwargs={} + ) + + # For num_experts % world_size != 0 case, + # assign the last (num_experts % world_size) experts to the last rank + # if rank == world_size -1: + # rank_mask = (selected_experts // experts_per_rank) >= rank + # else: + # rank_mask = (selected_experts // experts_per_rank) == rank + div_node = gm.graph.create_node( + "call_function", operator.floordiv, args=(selected_experts, experts_per_rank), kwargs={} + ) + comp_op = torch.ge if ep_rank == ep_size - 1 else torch.eq + rank_mask = gm.graph.create_node( + "call_function", comp_op, args=(div_node, ep_rank), kwargs={} + ) + + # final_scales_local = final_scales * rank_mask + final_scales_local = gm.graph.create_node( + "call_function", operator.mul, args=(final_scales, rank_mask), kwargs={} + ) + + args[1] = selected_experts_local + args[2] = final_scales_local + + if is_stacked: + # bmm-style stacked MoE: sharding is done by slicing the 1st dimension of the stacked weight tensor + # if mlp_type == MLPType.FUSED_GATED_MLP: + w_gate_up_stacked = flat_args[3] + w_down_stacked = flat_args[4] + local_lo, local_hi = _split_range_last_remainder(num_experts, ep_size, ep_rank) + _transform_bmm_moe_weight_param( + gm, w_gate_up_stacked, local_lo, local_hi, swap_gate_up=True + ) + _transform_bmm_moe_weight_param(gm, w_down_stacked, local_lo, local_hi, swap_gate_up=False) + else: + # listed MoE: sharding is done by taking 
a range of the listed weight tensors + + # -- Shard expert weights -- + def get_partition(lst, world_size, rank): + num_experts = len(lst) + expert_size_per_partition = num_experts // world_size + expert_start = rank * expert_size_per_partition + # For num_experts % world_size != 0 case, + # assign the last (num_experts % world_size) experts to the last rank + expert_end = ( + num_experts + if (rank == world_size - 1) + else expert_start + expert_size_per_partition + ) + return lst[expert_start:expert_end] + + w_up_list_sharded = get_partition(args[3], ep_size, ep_rank) + w_down_list_sharded = get_partition(args[4], ep_size, ep_rank) + w_gate_list_sharded = get_partition(args[5], ep_size, ep_rank) + + # if tp_size > 1, we do 2D EP+TP sharding. + # we add TP sharding of all expert weights. + for w_up in w_up_list_sharded + w_gate_list_sharded: + shard_weight_tensor( + gm=gm, + weight_tensor=gm.get_parameter(w_up.target), + param_key=w_up.target, + dim=SplitDimension.COLUMN, + rank=tp_rank, + world_size=tp_size, + ) + # here we don't need to add all-reduce: it's enough to have + # just one all-reduce after the whole EP+TP sharded MoE node. + for w_down in w_down_list_sharded: + shard_weight_tensor( + gm=gm, + weight_tensor=gm.get_parameter(w_down.target), + param_key=w_down.target, + dim=SplitDimension.ROW, + rank=tp_rank, + world_size=tp_size, + ) + + # -- Update args -- + args[3] = w_up_list_sharded + args[4] = w_down_list_sharded + args[5] = w_gate_list_sharded + + # Shard scales for quantized ops + for i in range(len(scale_names) * 3): # 3 layers (w1, w2, w3) × #scale_names per layer + args[6 + i] = get_partition(args[6 + i], ep_size, ep_rank) + + ad_logger.debug( + f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." 
+ ) + node.args = tuple(args) + + # -- add an all_reduce node -- + with gm.graph.inserting_after(node): + dist_node = gm.graph.call_function( + torch.ops.auto_deploy.torch_dist_all_reduce.default, args=(node, allreduce_strategy) + ) + node.replace_all_uses_with(dist_node) + dist_node.replace_input_with(dist_node, node) + + +def _slice_expert_dim(gm: GraphModule, tensor_node: Node, lo: int, hi: int) -> Node: + """Return tensor_node[lo:hi, ...] via aten.slice along dim 0.""" + with gm.graph.inserting_after(tensor_node): + # aten.slice.Tensor(self, dim, start, end, step) + return gm.graph.call_function( + torch.ops.aten.slice.Tensor, + args=(tensor_node, 0, lo, hi, 1), + ) + + +def _split_range_last_remainder(n: int, world_size: int, rank: int): + """[lo, hi) split along dim0; last rank gets remainder.""" + base = n // world_size + lo = base * rank + hi = n if rank == world_size - 1 else base * (rank + 1) + return lo, hi + + +def _insert_sharded_mxfp4_mlp_ep( + gm: GraphModule, + node: Node, + config: ShardingTransformConfig, +): + """ + Transform a call to auto_deploy::triton_mxfp4_moe into: + - sharded expert parameters along dim 0 (this rank's slice), + - call to auto_deploy::triton_mxfp4_moe_ep(..., local_lo, local_hi), + - followed by torch_dist_all_reduce. 
+ + Expects the original op signature: + (hidden_states, + router_weight, router_bias, top_k, + gate_up_blocks, gate_up_bias, gate_up_scales, + alpha, limit, + down_blocks, down_bias, down_scales) + """ + + IDX_GATE_UP_BLOCKS = 4 + IDX_GATE_UP_BIAS = 5 + IDX_GATE_UP_SCALES = 6 + IDX_DOWN_BLOCKS = 9 + IDX_DOWN_BIAS = 10 + IDX_DOWN_SCALES = 11 + + gate_up_blocks_node = node.args[IDX_GATE_UP_BLOCKS] + num_experts = int(gate_up_blocks_node.meta["val"].shape[0]) + + rank, world_size = config.rank, config.world_size + local_lo, local_hi = _split_range_last_remainder(num_experts, world_size, rank) + + # Prepare new args with slices for this rank + args = list(node.args) + args[IDX_GATE_UP_BLOCKS] = _slice_expert_dim(gm, args[IDX_GATE_UP_BLOCKS], local_lo, local_hi) + args[IDX_GATE_UP_BIAS] = _slice_expert_dim(gm, args[IDX_GATE_UP_BIAS], local_lo, local_hi) + args[IDX_GATE_UP_SCALES] = _slice_expert_dim(gm, args[IDX_GATE_UP_SCALES], local_lo, local_hi) + args[IDX_DOWN_BLOCKS] = _slice_expert_dim(gm, args[IDX_DOWN_BLOCKS], local_lo, local_hi) + args[IDX_DOWN_BIAS] = _slice_expert_dim(gm, args[IDX_DOWN_BIAS], local_lo, local_hi) + args[IDX_DOWN_SCALES] = _slice_expert_dim(gm, args[IDX_DOWN_SCALES], local_lo, local_hi) + + args_ep = tuple(args) + (int(world_size), int(rank)) + node.target = torch.ops.auto_deploy.triton_mxfp4_moe_ep.default + node.args = args_ep + + # Add a dist all-reduce after the op (sum partial results across EP ranks) + with gm.graph.inserting_after(node): + red = gm.graph.call_function(torch.ops.auto_deploy.torch_dist_all_reduce, args=(node,)) + node.replace_all_uses_with(red) + # keep dataflow: red(input=node) + red.replace_input_with(red, node) + + +def _process_simple_shard( + nodes_linear: Dict[Node, List[Node]], transform_container: ShardingTransformContainer, layer_type: LayerType = LayerType.MLP, ) -> int: # for every linear node: # --> row_split (dim 0 of weight) + all_gather (dim -1 of output) # if nodes_linear is a dict, flatten it to a 1D list 
of nodes - + config = transform_container.config if isinstance(nodes_linear, dict): nodes_linear = [n for group in nodes_linear.values() for n in group] @@ -177,8 +1940,7 @@ def _process_simple_shard( WeightShardingInfo.from_node( n, split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=config, dist_op="all_gather", min_local_shape=1, layer_type=layer_type, @@ -188,91 +1950,10 @@ def _process_simple_shard( return num_simple_shards -@TransformRegistry.register("detect_sharding") -class Sharding(BaseTransform): - """A transformation to apply sharding to the model following tensor parallelism. - - The transformation is based on the following steps: - - 1. Identify boundary nodes between residual nodes to identify shardable regions. - 2. Identify the GEMM nodes that can be sharded - 3. Trace through the subgraph using DFS/BFS between each pair of boundary nodes - 4. Account for each node in the trace to ensure the op is correct even after sharding. This is - necessary to ensure that the sharding is correct and we need to be able to account for - **all** nodes in the subgraph. The subgraph here is defined as the region between the first - linear node to the last linear node of an identified sharding region. - # 5. Shard the GEMM nodes or skip accordingly. - - min_local_shape is the minimum size of the local tensor shard, to prevent TP parallelism - splitting, e.g., the individual heads into smaller shards. 
- """ - - config: ShardingTransformConfig - - @classmethod - def get_config_class(cls) -> Type[TransformConfig]: - return ShardingTransformConfig - - def _apply( - self, - gm: GraphModule, - cm: CachedSequenceInterface, - factory: ModelFactory, - shared_config: SharedConfig, - ) -> Tuple[GraphModule, TransformInfo]: - local_rank, world_size = shared_config.local_rank, shared_config.world_size - if world_size < 2: - ad_logger.info("Skipping sharding for single device") - return gm, TransformInfo( - skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True - ) - assert isinstance(gm, GraphModule), "Expecting GraphModule" - self.config.factory_config = factory.get_sharding_config() if factory else {} - transform_container = shared_config.sharding_transform_container - transform_container.init_params(self.config, local_rank, world_size) - - info = TransformInfo(skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True) - for source in transform_container.sharding_source: - if source == ShardingSource.FACTORY: - if len(transform_container.get_factory_config()) == 0: - ad_logger.debug( - "No factory config found. Skipping sharding from factory config" - ) - continue - ad_logger.info("Applying sharding from factory config") - info += detect_sharding_from_config(gm, transform_container, ShardingSource.FACTORY) - elif source == ShardingSource.MANUAL: - if len(transform_container.get_manual_config()) == 0: - ad_logger.debug("No manual config found. 
Skipping sharding from manual config") - continue - ad_logger.info("Applying sharding from manual config") - info += detect_sharding_from_config(gm, transform_container, ShardingSource.MANUAL) - - elif source == ShardingSource.HEURISTIC: - ad_logger.info( - f"Running autodeploy sharding heuristics: {transform_container.sharding_dims}" - ) - # run TP sharding across ranks - if ShardingDim.TP in transform_container.sharding_dims: - info += detect_column_row_shard(gm, transform_container) - - # run EP sharding across ranks - if ShardingDim.EP in transform_container.sharding_dims: - info += detect_ep_shard(gm, transform_container) - - # run BMM sharding across ranks - if ShardingDim.BMM in transform_container.sharding_dims: - info += detect_dp_bmm_shard(gm, transform_container) - - return gm, info - - def _process_ssm_sharding( gm: GraphModule, entry_node: Node, transform_container: ShardingTransformContainer, - rank: int, - world_size: int, min_local_shape: int = 1, ) -> int: """ @@ -284,7 +1965,8 @@ def _process_ssm_sharding( except RuntimeError: ad_logger.warning("Could not find next linear node after entry_node for Mamba sharding") return 0 - + config = transform_container.config + world_size = config.world_size # Get subgraph between entry_node and next linear node subgraph_nodes = subgraph([entry_node], [out_proj_node]) @@ -323,8 +2005,7 @@ def _process_ssm_sharding( WeightShardingInfo.from_node( entry_node, split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=config, dist_op=None, min_local_shape=min_local_shape, fused_weight_dims=fused_weight_dims["in_proj"], @@ -343,16 +2024,14 @@ def _process_ssm_sharding( split_args_1[1] = [s // world_size for s in split_args_1[1]] transform_container.add( ParameterUpdateInfo( - rank=rank, - world_size=world_size, + config=config, target_node=split_nodes[0].name, args=tuple(split_args_0), ) ) transform_container.add( ParameterUpdateInfo( - rank=rank, - world_size=world_size, + config=config, 
target_node=split_nodes[1].name, args=tuple(split_args_1), ) @@ -372,7 +2051,7 @@ def _process_ssm_sharding( conv_args[-1] = conv1d_node.args[-1] // world_size transform_container.add( ParameterUpdateInfo( - rank=rank, world_size=world_size, target_node=conv1d_node.name, args=tuple(conv_args) + config=transform_container.config, target_node=conv1d_node.name, args=tuple(conv_args) ) ) @@ -406,8 +2085,7 @@ def _process_ssm_sharding( WeightShardingInfo.from_node( list(weight_node.users)[0], split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=config, dist_op=None, min_local_shape=min_local_shape, fused_weight_dims=fused_dims, @@ -433,7 +2111,7 @@ def _process_ssm_sharding( args[1] = tuple(view_shape) transform_container.add( ParameterUpdateInfo( - rank=rank, world_size=world_size, target_node=view_node.name, args=tuple(args) + config=transform_container.config, target_node=view_node.name, args=tuple(args) ) ) ad_logger.debug(f"\nUpdated view node {view_node} arguments to {view_node.args}") @@ -445,8 +2123,7 @@ def _process_ssm_sharding( WeightShardingInfo.from_node( out_proj_node, split_dim=SplitDimension.ROW, - rank=rank, - world_size=world_size, + config=transform_container.config, dist_op="all_reduce", layer_type=LayerType.MAMBA, ) @@ -458,13 +2135,13 @@ def _process_column_sharding( linear_nodes: List[Node], subgraph_nodes: Union[List[Node], None], transform_container: ShardingTransformContainer, - rank: int, - world_size: int, min_local_shape: int = 1, ) -> None: """ Parse the column sharding from the candidate nodes and update the view and split nodes accordingly. 
""" + config = transform_container.config + world_size = config.world_size if subgraph_nodes is None: subgraph_nodes = subgraph(linear_nodes, boundary_condition=is_any_lin_op) fused_weight_dims = None @@ -511,8 +2188,7 @@ def _process_column_sharding( WeightShardingInfo.from_node( linear_node, split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=config, dist_op=None, # for column sharding, no dist op is performed min_local_shape=min_local_shape, fused_weight_dims=fused_weight_dims, @@ -537,9 +2213,7 @@ def _process_column_sharding( view_shape[2] = -1 args[1] = tuple(view_shape) transform_container.add( - ParameterUpdateInfo( - rank=rank, world_size=world_size, target_node=view_node.name, args=tuple(args) - ) + ParameterUpdateInfo(target_node=view_node.name, config=config, args=tuple(args)) ) ad_logger.debug(f"\nUpdated view node {view_node} arguments to {view_node.args}") @@ -561,9 +2235,7 @@ def _process_column_sharding( args = list(user.args) args[1] = new_sizes transform_container.add( - ParameterUpdateInfo( - rank=rank, world_size=world_size, target_node=user.name, args=tuple(args) - ) + ParameterUpdateInfo(config=config, target_node=user.name, args=tuple(args)) ) elif len(slice_nodes) > 0: for slice_node in filtered_nodes(linear_node.users, ops=torch.ops.aten.slice): @@ -572,8 +2244,7 @@ def _process_column_sharding( args[3] = args[3] // world_size transform_container.add( ParameterUpdateInfo( - rank=rank, - world_size=world_size, + config=config, target_node=slice_node.name, args=tuple(args), ) @@ -581,6 +2252,11 @@ def _process_column_sharding( # chunk nodes do not need to be updated +######################################################## +# Topological pattern matching functions +######################################################## + + def detect_sharding_from_config( gm: GraphModule, transform_container: ShardingTransformContainer, @@ -591,8 +2267,10 @@ def detect_sharding_from_config( TODO: currently, it applies only to TP 
sharding. Args: gm: Graph module to apply transformations to - transform_container: containing predefined sharding configuration + transform_container: Container for sharding transformations + source: Sharding source """ + config = transform_container.config # check if config is valid. # 1. it is a Dict[str, str] # 2. the keys are of format "module.submodule.subsubmodule..." @@ -609,17 +2287,15 @@ def detect_sharding_from_config( # The following constraints are based on # https://github.com/huggingface/transformers/blob/d8e05951b8efd4880acca9a3f291e8b65841a86d/src/transformers/models/llama4/configuration_llama4.py#L249 if source == ShardingSource.FACTORY: - config = transform_container.get_factory_config() + config = transform_container.config.factory_config elif source == ShardingSource.MANUAL: - config = transform_container.get_manual_config() + config = transform_container.config.manual_config else: raise ValueError(f"Unsupported sharding source: {source}") head_dim = config["head_dim"] tp_plan = config["tp_plan"] - rank, world_size = transform_container.rank, transform_container.world_size - # If the node is inside the attention module, we need to set min_local_shape to the # head_dim - otherwise, we would risk splitting the heads into smaller shards. # TODO: is there a better way to check if we are in attention module? @@ -661,14 +2337,13 @@ def detect_sharding_from_config( pattern_regex = re.escape(pattern_string).replace("@", ".*") if re.match(pattern_regex, module_name): # we have a match. 
Get the config for this layer + config = tp_plan[key] if config == "colwise": _process_column_sharding( linear_nodes=[lin_node], subgraph_nodes=None, transform_container=transform_container, - rank=rank, - world_size=world_size, min_local_shape=min_local_shape, ) elif config == "rowwise": @@ -676,8 +2351,7 @@ def detect_sharding_from_config( WeightShardingInfo.from_node( lin_node, split_dim=SplitDimension.ROW, - rank=rank, - world_size=world_size, + config=transform_container.config, dist_op="all_reduce", min_local_shape=min_local_shape, layer_type=layer_type, @@ -687,10 +2361,7 @@ def detect_sharding_from_config( num_attention_shards += 1 num_row_col_shards += 1 elif config == "mamba": - if ( - _process_ssm_sharding(gm, lin_node, transform_container, rank, world_size) - > 0 - ): + if _process_ssm_sharding(gm, lin_node, transform_container) > 0: num_ssm_shards += 1 num_row_col_shards += 1 @@ -707,8 +2378,7 @@ def detect_sharding_from_config( WeightShardingInfo.from_node( lin_node, split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=transform_container.config, dist_op=None, min_local_shape=min_local_shape, layer_type=layer_type, @@ -719,8 +2389,7 @@ def detect_sharding_from_config( WeightShardingInfo.from_node( lin_node, split_dim=SplitDimension.ROW, - rank=rank, - world_size=world_size, + config=transform_container.config, dist_op="all_reduce", min_local_shape=min_local_shape, layer_type=layer_type, @@ -739,8 +2408,7 @@ def detect_sharding_from_config( WeightShardingInfo.from_node( lin_node, split_dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, + config=transform_container.config, dist_op="all_gather", min_local_shape=1, layer_type=layer_type, @@ -781,7 +2449,8 @@ def detect_ssm_shard( The goal is to have a unified single pass over the graph to detect layers and apply appropriate sharding transformations. 
""" - rank, world_size = transform_container.rank, transform_container.world_size + config = transform_container.config + world_size = config.world_size if world_size < 2: ad_logger.info("Skipping TP sharding for single device") return TransformInfo(skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True) @@ -797,7 +2466,7 @@ def detect_ssm_shard( in_proj_node, _ = bfs(ssm_node, is_any_lin_op, attr_next="args", include_root=False) num_ssm_shards += int( - _process_ssm_sharding(gm, in_proj_node, transform_container, rank, world_size) + _process_ssm_sharding(gm, in_proj_node, transform_container, config=config) ) ad_logger.info(f"Found {num_ssm_shards} SSM shards") @@ -808,7 +2477,7 @@ def detect_ssm_shard( def detect_column_row_shard( gm: GraphModule, - transfrom_container: ShardingTransformContainer, + transform_container: ShardingTransformContainer, ) -> TransformInfo: """A transformation to apply sharding to the model following tensor parallelism. @@ -827,7 +2496,8 @@ def detect_column_row_shard( splitting, e.g., the individual heads into smaller shards. 
""" ad_logger.debug("Before sharding graph: " + str(gm)) - rank, world_size = transfrom_container.rank, transfrom_container.world_size + config = transform_container.config + world_size = config.world_size assert isinstance(gm, GraphModule), "Expecting GraphModule" ad_logger.info("Running TP sharding detection") @@ -845,7 +2515,6 @@ def detect_column_row_shard( num_column_row_shards = 0 for opening, layer_subgraph, closing in layer_subgraphs: nodes_linear = opening + [closing] - num_shards += 1 ssm_nodes = list(filtered_nodes(layer_subgraph, is_any_ssm_op)) attention_nodes = list(filtered_nodes(layer_subgraph, is_any_attention_op)) @@ -858,12 +2527,12 @@ def detect_column_row_shard( else LayerType.MLP ) - if transfrom_container.simple_shard_only: + if config.simple_shard_only: ad_logger.debug( f"Forcing Simple Shard on nodes: {nodes_linear} with layer type: {layer_type}" ) num_simple_shards += _process_simple_shard( - nodes_linear, rank, world_size, transfrom_container, layer_type=layer_type + nodes_linear, transform_container, layer_type=layer_type ) continue @@ -873,7 +2542,9 @@ def detect_column_row_shard( assert len(opening) == 1, "Expected exactly one opening node in Mamba layer" ad_logger.debug(f"Found SSM nodes in layer subgraph: {ssm_nodes}") num_ssm_shards += _process_ssm_sharding( - gm, opening[0], transfrom_container, rank, world_size + gm, + opening[0], + transform_container, ) continue @@ -884,7 +2555,7 @@ def detect_column_row_shard( # only one attention operation. Fall back to simple shard. ad_logger.debug(f"More than one attention node: {attention_nodes}") num_simple_shards += _process_simple_shard( - nodes_linear, rank, world_size, transfrom_container, layer_type=layer_type + nodes_linear, transform_container, layer_type=layer_type ) continue # Extract head dimension. We cannot shard below the head_dim size. 
@@ -907,9 +2578,7 @@ def detect_column_row_shard( ) num_simple_shards += _process_simple_shard( nodes_linear, - rank, - world_size, - transfrom_container, + transform_container, layer_type=layer_type, ) # TODO: handle the case where num_kv_heads is not divisible by world_size @@ -919,19 +2588,16 @@ def detect_column_row_shard( _process_column_sharding( linear_nodes=opening, subgraph_nodes=layer_subgraph, - transform_container=transfrom_container, - rank=rank, - world_size=world_size, + transform_container=transform_container, min_local_shape=min_local_shape, ) # shard single row node - if transfrom_container.add( + if transform_container.add( WeightShardingInfo.from_node( closing, split_dim=SplitDimension.ROW, - rank=rank, - world_size=world_size, + config=config, dist_op="all_reduce", min_local_shape=min_local_shape, layer_type=layer_type, @@ -942,10 +2608,9 @@ def detect_column_row_shard( num_attention_shards += 1 # simple shard remaining linear nodes - num_simple_shards += _process_simple_shard( - unprocessed_linear_nodes, rank, world_size, transfrom_container - ) + num_simple_shards += _process_simple_shard(unprocessed_linear_nodes, transform_container) num_column_row_shards += num_ssm_shards + num_shards = num_simple_shards + num_column_row_shards ad_logger.info( f"Heuristics found {num_shards} TP shards. Simple: {num_simple_shards}, " f"row-col: {num_column_row_shards} (including: ssm: {num_ssm_shards}, attention: {num_attention_shards})" @@ -967,7 +2632,8 @@ def detect_dp_bmm_shard( We'll also assume that the inputs to BMM are broadcasted across the devices already. 
""" ad_logger.debug("Before sharding graph: " + str(gm)) - rank, world_size = transform_container.rank, transform_container.world_size + config = transform_container.config + rank, world_size = config.rank, config.world_size if world_size < 2: ad_logger.info("Skipping DP BMM sharding for single device") return TransformInfo(skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True) @@ -1016,11 +2682,10 @@ def detect_dp_bmm_shard( transform_container.add( BMMShardingInfo( - target_node=node.name, - rank=rank, - world_size=world_size, start_idx=start_idx, end_idx=end_idx, + target_node=node.name, + config=config, ) ) ad_logger.debug( @@ -1042,7 +2707,8 @@ def detect_ep_shard( ) -> TransformInfo: ad_logger.debug("Before sharding graph: " + str(gm)) - rank, world_size = transform_container.rank, transform_container.world_size + config = transform_container.config + world_size = config.world_size if world_size < 2: ad_logger.info("Skipping EP sharding for single device") return TransformInfo(skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True) @@ -1052,11 +2718,19 @@ def detect_ep_shard( for node in list(gm.graph.nodes): if not is_any_moe_op(node): continue + args = _canonicalize_node_args(node) + if isinstance(args[3], Node): + mlp_type = MLPType.FUSED_GATED_MLP + else: + if len(args[5]) > 0: + mlp_type = MLPType.GATED_MLP + else: + mlp_type = MLPType.MLP if transform_container.add( EPShardingInfo.from_node( node, - rank=rank, - world_size=world_size, + config=config, + mlp_type=mlp_type, ) ): num_moe_patterns += 1 diff --git a/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py index fa91a79257..d27cc27f79 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/node_utils.py @@ -145,9 +145,7 @@ def extract_weight_node(node: Node) -> int: # for modelopt quantized graph, there will be a quantize_op _, weight_params, _ = 
get_quantization_params_from_linear_node(node) weight_node = weight_params.input_node if weight_params else weight_node - assert weight_node is not None, ( - "Expected exactly at least one weight node in the parametrized node" - ) + assert weight_node is not None, "Expected at least one weight node in the parametrized node" return find_get_attr_node(weight_node) diff --git a/tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py deleted file mode 100644 index c985cfdac6..0000000000 --- a/tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py +++ /dev/null @@ -1,1840 +0,0 @@ -"""Sharding config definitions for the inference optimizer.""" - -import math -import operator -import re -from abc import ABC, abstractmethod -from enum import Enum, IntEnum -from functools import partial -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Literal, - Optional, - Sequence, - Tuple, - Union, -) - -import torch -import torch.nn as nn -from pydantic import BaseModel, ConfigDict, Field, field_validator -from torch.fx import GraphModule, Node - -from ....functional import AllReduceStrategy -from ..models.factory import ShardingConfigSource -from ..utils.logger import ad_logger -from .node_utils import ( - bfs, - extract_param_names_from_node, - is_any_lin_op, - is_op, - num_users_of_weight_node, - subgraph, -) -from .quantization_utils import ( - cutlass_fp4_scale_to_modelopt_fp4_scale, - modelopt_fp4_scale_to_cutlass_fp4_scale, -) - -if TYPE_CHECKING: - from ..transform.library.sharding import ShardingTransformConfig - - -def validate_allreduce_strategy(v): - """Convert string names like 'AUTO' to AllReduceStrategy enum. - - This is a shared validator for allreduce_strategy fields across all config classes. 
- - Args: - v: Value to validate - can be AllReduceStrategy enum, string name, or integer value - - Returns: - AllReduceStrategy enum value - - Raises: - ValueError: If the input is an invalid strategy string - """ - if isinstance(v, AllReduceStrategy): - return v - if isinstance(v, str): - # Try to get enum by name - try: - return AllReduceStrategy[v] - except KeyError: - raise ValueError( - f"Invalid allreduce strategy: {v}. " - f"Valid options: {', '.join(s.name for s in AllReduceStrategy)}" - ) - if isinstance(v, int): - return AllReduceStrategy(v) - return v # Let Pydantic handle other types - - -def _get_dist_ops(backend: str): - """Get the appropriate distributed ops based on backend availability. - - Args: - backend: The distributed backend to use. Can be 'auto', 'trtllm', or 'torch'. - 'auto' will automatically select based on availability. - - Returns tuple of (all_gather_op, all_reduce_op) for the current backend. - """ - from ..custom_ops.trtllm_dist import is_trtllm_op_available - - # Handle DistBackend enum or string - if hasattr(backend, "value"): - backend = backend.value - - if backend == "trtllm": - # Force TRT-LLM ops - return ( - torch.ops.auto_deploy.trtllm_dist_all_gather.default, - torch.ops.auto_deploy.trtllm_dist_all_reduce.default, - ) - elif backend == "torch": - # Force PyTorch distributed ops - return ( - torch.ops.auto_deploy.torch_dist_all_gather.default, - torch.ops.auto_deploy.torch_dist_all_reduce.default, - ) - else: # auto - # Automatically select based on availability - if is_trtllm_op_available(): - # Use TRT-LLM optimized ops in MPI mode - return ( - torch.ops.auto_deploy.trtllm_dist_all_gather.default, - torch.ops.auto_deploy.trtllm_dist_all_reduce.default, - ) - else: - # Use PyTorch distributed ops in demollm mode - return ( - torch.ops.auto_deploy.torch_dist_all_gather.default, - torch.ops.auto_deploy.torch_dist_all_reduce.default, - ) - - -def _load_hook( - state_dict, - prefix, - *args, - f_split: Callable[[torch.Tensor, 
int], torch.Tensor], - param_key: str, - param_shape: torch.Size, -): - # TODO: we need to support loading either a sharded or unsharded checkpoint. - # Otherwise, basic workflows like - # model.load_state_dict(model.state_dict()) will fail. - # This is quite a hacky solution. A better solution would be to store extra_state in - # the state_dict to identify whether the state_dict is sharded or not. - key = prefix + param_key - ad_logger.debug(f"Sharder LOAD hook is called for '{key}'") - if key not in state_dict: - return - p_to_load = state_dict[key] - - p_to_load = p_to_load if param_shape == p_to_load.shape else f_split(p_to_load) - - state_dict[key] = p_to_load - - -def _load_hook_remove( - state_dict: Dict, - prefix: str, - *args, - param_key: str, -): - key = prefix + param_key - ad_logger.debug(f"Sharder LOAD hook is called for '{key}'") - state_dict.pop(key, None) - - -def _validate_sharded_shapes( - node: Node, fused_weight_dims: Optional[list] = None, world_size: Optional[int] = None -) -> None: - """ - Update the shapes of the view nodes and the split node parameters to account for the TP sharding. - 1. After sharding weights of the linear node using column split - in attention module (Q, K, V), - the output Y = X @ W^T shape is [batch, seq, num_heads // TP_size, head_dim]. - Some models hardcode the shape of the output to [batch, seq, num_heads, head_dim] - instead of implicit [batch, seq, -1, head_dim]. - Detect such cases and update the shape of the view node accordingly. - 2. If the weights are fused (e.g,. QKV, gate_up, SSM, etc.), the follow-up split node parameters - need to be updated to account for the TP sharding. - """ - - # get the subgraph of this module. Subgraph boundary is the next linear node. 
- next_lin_node, _ = bfs(node, is_any_lin_op, include_root=False) - nodes_to_validate = subgraph( - [node], - include=lambda n: is_op(n, [torch.ops.aten.view, torch.ops.aten.reshape]), - boundary_condition=is_any_lin_op, - ) - for shape_node in nodes_to_validate: - # Parameter update must be idempotent - if "sharded" in shape_node.meta and shape_node.meta["sharded"]: - continue - if len(shape_node.args) < 2: - continue - view_shape = list(shape_node.args[1]) - if not isinstance(view_shape, list): - continue - if len(view_shape) >= 3 and isinstance(view_shape[2], int) and view_shape[2] != -1: - args = list(shape_node.args) - view_shape[2] = -1 # view_shape[2] // world_size - args[1] = tuple(view_shape) - shape_node.args = tuple(args) - shape_node.meta["sharded"] = True - ad_logger.debug(f"\nUpdated view node {shape_node} arguments to {shape_node.args}") - - # if fused_weight_dims is provided, we need to update all split sizes - if fused_weight_dims is not None: - assert world_size is not None, "World size is required to update the split node params" - assert len(node.users) == 1, "Fused linear node should have only one user: a split node" - # find all split nodes in the region between this linear node and the next - split_nodes = subgraph( - [node], - [next_lin_node], - include=lambda n: is_op(n, [torch.ops.aten.split_with_sizes]), - ) - for split_node in split_nodes: - # Parameter update must be idempotent - if "sharded" in split_node.meta and split_node.meta["sharded"]: - continue - orig_sizes = split_node.args[1] - new_sizes = [orig_sizes[i] // world_size for i in range(len(orig_sizes))] - args = list(split_node.args) - args[1] = new_sizes - split_node.args = tuple(args) - split_node.meta["sharded"] = True - ad_logger.debug(f"\nUpdated split node {split_node} arguments to {split_node.args}") - - -def shard_weight_tensor( - gm: GraphModule, - weight_tensor: torch.Tensor, - param_key: str, - dim: int, - rank: int, - world_size: int, - min_local_shape: int = 1, - 
fused_weight_dims: Optional[list] = None, - requires_grad: bool = False, - update_param: bool = True, - custom_shard_fn: Optional[Callable[[torch.Tensor], torch.Tensor]] = None, -) -> Tuple[torch.Tensor, torch.Size]: - """Shard a weight tensor across ranks and register load hook. - - Args: - gm: GraphModule containing the weight - weight_tensor: The weight tensor to shard - param_key: Parameter key for registering load hook - dim: Dimension to shard along - rank: Current rank - world_size: Total number of ranks - min_local_shape: Minimum local shape constraint (for GQA) - fused_weight_dims: List of dimensions for fused weights - custom_shard_fn: Optional custom function to shard the tensor - requires_grad: Whether the parameter should require gradients - update_param: Whether to update the parameter in the module - - Returns: - Tuple of (sharded_tensor, sharded_shape) - """ - - def split_tensor( - t: torch.Tensor, - d: int = dim, - r: int = rank, - ws: int = world_size, - min_d_shape: int = min_local_shape, - ) -> torch.Tensor: - # The local tensor shape has to be divisible by min_d_shape - max_split_size = t.shape[d] // min_d_shape - if ws > max_split_size: - num_groups = math.ceil(ws / max_split_size) - ad_logger.debug( - f"World size {ws} is greater than the max split size {max_split_size}. 
" - + f"Splitting tensor to {num_groups} chunks" - ) - return torch.tensor_split(t, max_split_size, dim=d)[r // num_groups] - return torch.tensor_split(t, ws, dim=d)[r] - - # Handle fused weights - if fused_weight_dims is not None: - - def split_fused_tensor( - t: torch.Tensor, - fused_dims: list = fused_weight_dims, - d: int = dim, - ) -> torch.Tensor: - return torch.cat( - [split_tensor(w) for w in torch.split(t, fused_dims, dim=d)], - dim=d, - ) - - f_split = split_fused_tensor - else: - f_split = split_tensor - - sharded_weight = f_split(weight_tensor) - sharded_shape = sharded_weight.shape - - # Register load hook - gm._register_load_state_dict_pre_hook( - partial( - _load_hook, - f_split=f_split, - param_key=param_key, - param_shape=sharded_shape, - ) - ) - - # Update the parameter in the module - if update_param: - modname, _, param_name = param_key.rpartition(".") - submod = gm.get_submodule(modname) - param_new = nn.Parameter(sharded_weight.detach().clone(), requires_grad=requires_grad) - setattr(submod, param_name, param_new) - - return sharded_weight, sharded_shape - - -def get_all_weights_in_subgraph( - sources: list[Node], - sinks: list[Node], -): - """Get all weight nodes (get_attr nodes) in the subgraph between sources and sinks.""" - weight_nodes = subgraph(sources, sinks, include=lambda n: n.op == "get_attr") - return weight_nodes - - -def _insert_sharded_mamba( - gm: GraphModule, - entry_node: Node, - dim: int, - rank: int, - world_size: int, - allreduce_strategy: AllReduceStrategy, - dist_backend: str, - add_dist: bool = False, - min_local_shape: int = 1, - weights_to_shard: Optional[list[str]] = None, - weight_shard_dims: Optional[Dict[str, int]] = None, - fused_weight_dims: Optional[Dict[str, list]] = None, - quantization_cb: Optional[ - Callable[[GraphModule, nn.Module, Node, str, torch.Size, int, int, int], None] - ] = None, -) -> bool: - """ - To shard Mamba layer, first column-shard the first linear layer: entry_node, - - NOTE: 
allreduce_strategy is MANDATORY and must be explicitly provided. - then shard all remaining weight tensors found in the subgraph defined between - entry_node and the next successor linear node. - First, validate if this is indeed a mamba module: within the subgraph, - there should be an torch_ssm node and conv1d node. - - Args: - gm: GraphModule - entry_node: The first linear node of the Mamba layer - dim: Default shard dimension - rank: Current rank - world_size: Total number of ranks - add_dist: Whether to add distribution op after entry_node - min_local_shape: Minimum local shape constraint - weights_to_shard: Optional list of regex patterns to match weight names - weight_shard_dims: Optional dict mapping weight keys to their shard dimensions - fused_weight_dims: Optional dict mapping weight keys to their fused dimension lists - quantization_cb: Optional quantization callback - """ - if allreduce_strategy is None: - raise ValueError( - f"allreduce_strategy must be set for Mamba sharding on node {entry_node.name}" - ) - # Find next linear node to define subgraph boundary - try: - next_lin_node, depth = bfs(entry_node, is_any_lin_op, include_root=False) - except RuntimeError: - ad_logger.warning("Could not find next linear node after entry_node for Mamba sharding") - return False - - # Get subgraph between entry_node and next linear node - subgraph_nodes = subgraph([entry_node], [next_lin_node]) - - ############################################################## - ########## validate if this is a valid Mamba module ########## - ############################################################## - # has_ssm = any(is_op(n, torch.ops.auto_deploy.mamba.torch_ssm_transform) for n in subgraph_nodes) - has_ssm = True - conv1d_nodes = [ - n - for n in subgraph_nodes - if is_op(n, [torch.ops.aten.conv1d, torch.ops.auto_deploy.torch_causal_conv1d]) - ] - if len(conv1d_nodes) != 1 or not has_ssm: - ad_logger.warning( - f"Subgraph does not contain exactly one conv1d node and 
torch_ssm_transform. " - f"Skipping Mamba sharding. conv1d_nodes={conv1d_nodes}, has_ssm={has_ssm}" - ) - return False - - ############################################################## - ########## infer split sizes for in_proj and conv1d ########## - ############################################################## - # in_proj and conv1d are most likely fused, followed up by split nodes. Infer split sizes: - if fused_weight_dims is None: - split_nodes = [ - n - for n in subgraph_nodes - if is_op(n, [torch.ops.aten.split, torch.ops.aten.split_with_sizes]) - ] - if len(split_nodes) != 2: - ad_logger.warning( - f"Subgraph does not contain exactly two split nodes. " - f"Skipping Mamba sharding. split_nodes={split_nodes}" - ) - return False - split_sizes_1 = split_nodes[0].args[1] - split_sizes_2 = split_nodes[1].args[1] - if split_sizes_1[1] != sum(split_sizes_2): - ad_logger.warning( - f"Split nodes have different sizes. " - f"Skipping Mamba sharding. split_sizes_1={split_sizes_1}, split_sizes_2={split_sizes_2}" - ) - return False - fused_weight_dims = { - "in_proj": split_sizes_1[0:1] + split_sizes_2 + split_sizes_1[2:], - "conv1d": split_sizes_2, - } - - conv1d_node = conv1d_nodes[0] - # conv1d_node last argument is the number of output channels. 
- # This one is also sharded, so we need to update this parameter - conv_args = list(conv1d_node.args) - conv_args[-1] = conv1d_node.args[-1] // world_size - conv1d_node.args = tuple(conv_args) - - ############################################################## - ####### shard the entry_node (the first linear layer) ######## - ############################################################## - # Extract entry node's fused_weight_dims by matching weight name against patterns - entry_fused_dims = None - if fused_weight_dims: - entry_weight_key, _ = extract_param_names_from_node(entry_node) - for pattern, dims in fused_weight_dims.items(): - if re.search(pattern, entry_weight_key): - entry_fused_dims = dims - break - - _shard_parameter_node( - gm=gm, - node=entry_node, - dim=SplitDimension.COLUMN, - rank=rank, - world_size=world_size, - dist_backend=dist_backend, - add_dist=False, - min_local_shape=min_local_shape, - fused_weight_dims=entry_fused_dims, - quantization_cb=quantization_cb, - allreduce_strategy=allreduce_strategy, - ) - - ############################################################## - ######## Shard remaining weights: conv1d and RMSNorm ######### - ############################################################## - # Get all weight nodes in the subgraph except for out_proj - weight_nodes = [ - n - for n in get_all_weights_in_subgraph([entry_node], [next_lin_node]) - if "out_proj" not in str(n) - ] - - for weight_node in weight_nodes: - weight_key = weight_node.target - - # Filter by regex patterns if provided - if weights_to_shard is not None: - if not any(pattern in weight_key for pattern in weights_to_shard): - continue - - # Determine shard dimension for this weight - shard_dim = weight_shard_dims.get(weight_key, dim) if weight_shard_dims else dim - - # Get the weight parameter - try: - weight_param = gm.get_parameter(weight_key) - except AttributeError: - ad_logger.debug(f"Could not get parameter for {weight_key}, skipping") - continue - - # Get fused dims 
for this weight if specified - fused_dims = None - for k, v in fused_weight_dims.items(): - if k in weight_key: - fused_dims = v - break - - # Shard the weight tensor (also updates the parameter in the module) - _, sharded_shape = shard_weight_tensor( - gm=gm, - weight_tensor=weight_param, - param_key=weight_key, - dim=shard_dim, - rank=rank, - world_size=world_size, - min_local_shape=min_local_shape, - fused_weight_dims=fused_dims, - ) - - ad_logger.debug( - f"Sharded weight {weight_key} on dim {shard_dim}: " - f"{weight_param.shape} -> {sharded_shape}" - ) - - ############################################################## - ############## update split node parameters ################## - ############################################################## - next_lin_node, _ = bfs(entry_node, is_any_lin_op, include_root=False) - - split_nodes = subgraph( - [entry_node], - [next_lin_node], - include=lambda n: is_op(n, [torch.ops.aten.split_with_sizes]), - ) - for split_node in split_nodes: - orig_sizes = split_node.args[1] - new_sizes = [orig_sizes[i] // world_size for i in range(len(orig_sizes))] - args = list(split_node.args) - args[1] = new_sizes - split_node.args = tuple(args) - ad_logger.debug(f"\nUpdated split node {split_node} arguments to {split_node.args}") - - nodes_to_validate = subgraph( - [entry_node], - include=lambda n: is_op(n, [torch.ops.aten.view, torch.ops.aten.reshape]), - boundary_condition=is_any_lin_op, - ) - for reshape_node in nodes_to_validate: - if len(reshape_node.args) < 2: - continue - if "sharded" in reshape_node.meta and reshape_node.meta["sharded"]: - continue - view_shape = list(reshape_node.args[1]) - if not isinstance(view_shape, list): - continue - if len(view_shape) >= 3 and isinstance(view_shape[2], int) and view_shape[2] != -1: - args = list(reshape_node.args) - view_shape[2] = -1 # view_shape[2] // world_size - args[1] = tuple(view_shape) - reshape_node.args = tuple(args) - reshape_node.meta["sharded"] = True - 
ad_logger.debug(f"\nUpdated view node {reshape_node} arguments to {reshape_node.args}") - - -def _shard_parameter_node( - gm: GraphModule, - node: Node, - dim: int, - rank: int, - world_size: int, - allreduce_strategy: AllReduceStrategy, - dist_backend: str, - add_dist: bool = False, - min_local_shape: int = 1, - fused_weight_dims: Optional[list] = None, - quantization_cb: Optional[ - Callable[[GraphModule, nn.Module, Node, str, torch.Size, int, int, int], None] - ] = None, -) -> None: - """Replace the node with parametrized weight tensor with a new node that accepts sharded weights. - - NOTE: allreduce_strategy is MANDATORY and must be explicitly provided. - - The state_dict is also updated to contain the sharded weights. - """ - if allreduce_strategy is None: - raise ValueError( - f"allreduce_strategy must be set for parameter sharding on node {node.name}" - ) - assert dim in [0, 1], "Only dim 0 and 1 are supported for sharding" - assert add_dist or dim == 0, "For dim=1 sharding, dist_op is required." - - num_users = num_users_of_weight_node(node) - if num_users > 1 or num_users == 0: - ad_logger.warning( - f"Weight node {node} has {num_users} users. This is not supported for sharding. Skipping." 
- ) - return - # get weight and bias key - weight_key, bias_key = extract_param_names_from_node(node) - - modname = weight_key.rpartition(".")[0] - submod = gm.get_submodule(modname) - - # Shard weight using the unified function (also updates the parameter) - original_weight = gm.get_parameter(weight_key) - _, weight_new_shape = shard_weight_tensor( - gm=gm, - weight_tensor=original_weight, - param_key=weight_key, - dim=dim, - rank=rank, - world_size=world_size, - min_local_shape=min_local_shape, - fused_weight_dims=fused_weight_dims, - ) - - if bias_key is not None and dim == 0: - # update bias for dim 0 --> we can handle it like the weight - original_bias = gm.get_parameter(bias_key) - shard_weight_tensor( - gm=gm, - weight_tensor=original_bias, - param_key=bias_key, - dim=dim, - rank=rank, - world_size=world_size, - min_local_shape=min_local_shape, - fused_weight_dims=fused_weight_dims, - ) - elif bias_key is not None and rank != world_size - 1: - # update the bias for dim 1 --> in this case only the last rank gets the bias to avoid - # double counting it. For all other we will delete the bias. 
- args = list(node.args) - node_bias = args[2] - args[2] = None - node.args = tuple(args) - gm.graph.erase_node(node_bias) - bias_param_name = bias_key.rpartition(".")[-1] - setattr(submod, bias_param_name, None) - gm._register_load_state_dict_pre_hook(partial(_load_hook_remove, param_key=bias_key)) - - if quantization_cb is not None: - quantization_cb( - gm=gm, - submod=submod, - node=node, - weight_key=weight_key, - weight_new_shape=weight_new_shape, - dim=dim, - rank=rank, - world_size=world_size, - ) - - # # # column shard with no gather: the output is sharded - if not add_dist: - return - - # figure out the right dist op (backend-aware) - all_gather_op, all_reduce_op = _get_dist_ops(dist_backend) - dist_lookup = { - 0: (all_gather_op, -1), - 1: (all_reduce_op, allreduce_strategy.name), - } - fn_dist, *dist_args = dist_lookup[dim] - - # add reduction node - with gm.graph.inserting_after(node): - dist_node = gm.graph.call_function(fn_dist, args=(node,) + tuple(dist_args)) - node.replace_all_uses_with(dist_node) - dist_node.replace_input_with(dist_node, node) - - -def _update_node_args(node: Node, args: tuple) -> None: - """Update the node's arguments with the new sharded arguments.""" - if "sharded" in node.meta and node.meta["sharded"]: - return - node.args = args - node.meta["sharded"] = True - ad_logger.debug( - f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." 
- ) - - -class SplitDimension(IntEnum): - """Enum for tensor split dimensions in sharding.""" - - # NOTE: The names COLUMN/ROW reflect the hugging face - # base_tp_plan sharding notation, but since we assume Y = W @ X^T, - # when splitting weight matrix W^T across columns, the actual split - # is over dimension 0 - COLUMN = 0 - ROW = 1 - - -class ShardingTransformInfo(BaseModel, ABC): - """Abstract base class for transformation configurations.""" - - model_config = ConfigDict(frozen=True) # Makes the model immutable and hashable - - target_node: str - rank: int - world_size: int - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """ - Validate whether the transformation is valid. - Execute right before applying the transformation. - """ - return True - - @abstractmethod - def apply(self, gm: GraphModule, node: Node) -> None: - """Apply the transformation to the graph module. - - This method must be implemented by each transformation class. - """ - pass - - def check_and_apply(self, gm: GraphModule, node: Node) -> bool: - """ - Check if the transformation is valid and apply it if it is. - Return True if the transformation is applied, False otherwise. - """ - if not self.validate(gm, node): - ad_logger.warning(f"Skipping invalid transformation {self}.") - return False - self.apply(gm, node) - return True - - -class LayerType(Enum): - ATTENTION = "attention" - MAMBA = "mamba" - MLP = "mlp" - MOE = "moe" - - -class WeightShardingInfo(ShardingTransformInfo): - """Configuration for TP sharding transformations. - - NOTE: allreduce_strategy will be automatically injected by ShardingConfig.add() - if not provided at creation time. The strategy comes from the parent ShardingConfig. 
- """ - - split_dim: SplitDimension - dist_op: Optional[Literal["all_reduce", "all_gather"]] = None - min_local_shape: int = 1 - layer_type: LayerType = LayerType.MLP - # used for TP sharding of fused weights - fused_weight_dims: Optional[list] = None - allreduce_strategy: Optional[AllReduceStrategy] = None # Set by ShardingConfig.add() if None - dist_backend: Optional[str] = None # Set by ShardingConfig.add() if None - - def quantization_cb( - self, - gm: GraphModule, - submod: nn.Module, - node: Node, - weight_key: str, - weight_new_shape: torch.Size, - dim: int, - rank: int, - world_size: int, - ) -> None: - """Quantization callback. Default does nothing for non-quantized models.""" - return None - - @classmethod - def from_node(cls, node: Node, **kwargs) -> "WeightShardingInfo": - """ - Create the correct TPShardingInfo subclass (FP8/FP4/base) based on `node`. - """ - subcls = _resolve_tp_cls_from_node(node) - return subcls(target_node=node.name, **kwargs) - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """Validate the transformation configuration.""" - if self.dist_op is not None: - if self.split_dim == SplitDimension.COLUMN: - if self.dist_op == "all_reduce": - ad_logger.warning( - f"Column split is only supported for all_gather. Skipping {self}." - ) - return False - if self.split_dim == SplitDimension.ROW: - if self.dist_op == "all_gather": - ad_logger.warning( - f"Row split is only supported for all_reduce. Skipping {self}." 
- ) - return False - return True - - def apply(self, gm: GraphModule, node: Node) -> None: - """Apply TP sharding transformation to the graph module.""" - _shard_parameter_node( - gm=gm, - node=node, - dim=self.split_dim.value, - rank=self.rank, - world_size=self.world_size, - add_dist=self.dist_op is not None, - dist_backend=self.dist_backend, - min_local_shape=self.min_local_shape, - fused_weight_dims=self.fused_weight_dims, - quantization_cb=self.quantization_cb, - allreduce_strategy=self.allreduce_strategy, - ) - - -class ParameterUpdateInfo(ShardingTransformInfo): - """Configuration for node args sharding transformations.""" - - args: tuple - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """Validate the transformation configuration.""" - return len(node.args) == len(self.args) - - def apply(self, gm: GraphModule, node: Node) -> None: - """Apply the transformation to the graph module.""" - _update_node_args(node, self.args) - - -class QuantizationShardingMixin(ABC): - """ - Mixin that provides a callback to handle quantization-aware sharding: - - shards/rewrites scale buffers - - registers the quantized shard load hook - """ - - @abstractmethod - def scale_names(self) -> List[str]: ... 
- - def shard_scales( - self, - dim: int, - rank: int, - world_size: int, - weight_shape: torch.Size, - **scales: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - return {k: v for k, v in scales.items() if isinstance(v, torch.Tensor)} - - def shard_load_hook( - self, - state_dict, - prefix, - *args, - weight_name: str, - weight_shape: torch.Size, - dim: int, - rank: int, - world_size: int, - ) -> None: - return - - def quantization_cb( - self, - gm: GraphModule, - submod: nn.Module, - node: Node, - weight_key: str, - weight_new_shape: torch.Size, - dim: int, - rank: int, - world_size: int, - ) -> None: - scales = {} - for scale_name in self.scale_names(): - scales[scale_name] = submod.get_buffer(scale_name) - scales["weight_shape"] = weight_new_shape - sharded_scales = self.shard_scales(dim, rank, world_size, **scales) - for k, v in sharded_scales.items(): - submod.register_buffer(k, v) - - gm._register_load_state_dict_pre_hook( - partial( - self.shard_load_hook, - weight_name=weight_key, - weight_shape=weight_new_shape, - dim=dim, - rank=rank, - world_size=world_size, - ) - ) - - -class FP8TPShardingInfo(QuantizationShardingMixin, WeightShardingInfo): - """Tensor-parallel sharding for FP8-quantized linears.""" - - def scale_names(self) -> List[str]: - return ["input_scale", "weight_scale"] - - def shard_scales( - self, - dim: int, - rank: int, - world_size: int, - weight_shape: torch.Size, - *, - input_scale: torch.Tensor, - weight_scale: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - return { - "input_scale": input_scale, - "weight_scale": weight_scale, - } - - def shard_load_hook( - self, - state_dict, - prefix, - *args, - weight_name: str, - weight_shape: torch.Size, - dim: int, - rank: int, - world_size: int, - ) -> None: - return - - -def _shard_fp4_weight_scale(weight_scale, sharded_uint8_weight_shape, dim, rank, world_size): - assert weight_scale.dim() == 1 - weight_shape_original = list(sharded_uint8_weight_shape) - weight_shape_original[dim] = 
weight_shape_original[dim] * world_size - weight_shape_original[-1] *= 2 - modelopt_weight_scale = cutlass_fp4_scale_to_modelopt_fp4_scale( - weight_scale, tuple(weight_shape_original) - ) - return modelopt_fp4_scale_to_cutlass_fp4_scale( - modelopt_weight_scale.tensor_split(world_size, dim=dim)[rank] - ) - - -class FP4TPShardingInfo(QuantizationShardingMixin, WeightShardingInfo): - """Tensor-parallel sharding for FP4-quantized linears.""" - - def scale_names(self) -> List[str]: - return ["input_scale", "weight_scale", "alpha"] - - def shard_scales( - self, - dim: int, - rank: int, - world_size: int, - weight_shape: torch.Size, - *, - weight_scale: torch.Tensor, - alpha: torch.Tensor, - input_scale: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - return { - "alpha": alpha, - "input_scale": input_scale, - "weight_scale": _shard_fp4_weight_scale( - weight_scale, weight_shape, dim, rank, world_size - ), - } - - def shard_load_hook( - self, - state_dict, - prefix, - *args, - weight_name: str, - weight_shape: torch.Size, - dim: int, - rank: int, - world_size: int, - ) -> None: - key = weight_name + "_scale" - if key in state_dict: - state_dict[key] = _shard_fp4_weight_scale( - state_dict[key], weight_shape, dim, rank, world_size - ) - - -TP_SHARDING_RULES = [ - (lambda n: is_op(n, torch.ops.auto_deploy.torch_fake_quant_fp8_linear), FP8TPShardingInfo), - (lambda n: is_op(n, torch.ops.auto_deploy.torch_fake_quant_nvfp4_linear), FP4TPShardingInfo), -] - - -def _resolve_tp_cls_from_node(node: Node): - for pred, cls in TP_SHARDING_RULES: - try: - if pred(node): - return cls - except Exception: - pass - return WeightShardingInfo - - -class BMMShardingInfo(ShardingTransformInfo): - """Configuration for BMM sharding transformations.""" - - rank: int - world_size: int - start_idx: int - end_idx: int - dist_backend: Optional[str] = None # Set by ShardingConfig.add() if None - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """Validate the transformation 
configuration.""" - if not is_op(node, torch.ops.aten.bmm): - ad_logger.warning(f"BMM sharding is only supported for BMM nodes. Skipping {self}.") - return False - - # Get the input tensors - lhs_tensor = node.args[0] - rhs_tensor = node.args[1] - - # Check batch sizes from meta information - lhs_batch_size = lhs_tensor.meta["val"].shape[0] - rhs_batch_size = rhs_tensor.meta["val"].shape[0] - - assert lhs_batch_size == rhs_batch_size, "Batch sizes of both tensors must match" - bmm_batch_size = lhs_batch_size - - # Check if the distribution is balanced - remainder = bmm_batch_size % self.world_size - - # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather/trtllm_dist_all_gather - # doesn't support uneven splits at the moment. - if remainder: - ad_logger.warning( - f"BMM batch size {bmm_batch_size} is not divisible by world size {self.world_size}. " - f"This will result in uneven distribution of work across devices. Skipping." - ) - return False - return True - - def apply(self, gm: GraphModule, node: Node) -> None: - """Apply BMM sharding transformation to the graph module.""" - - def handle_tensor( - bmm_node: Node, tensor_node: Node, arg_idx: int, start_idx: int, end_idx: int - ): - """Unified helper function to shard either a parameter tensor or a dynamic tensor. 
- - Args: - bmm_node: The BMM node that is being processed - tensor_node: The input tensor node to shard - arg_idx: The argument index of the tensor in the BMM node - start_idx: Start index for sharding - end_idx: End index for sharding - """ - - # Define slice function for the sharding - def slice_tensor(t: torch.Tensor) -> torch.Tensor: - return t[start_idx:end_idx] - - if tensor_node.op == "get_attr": - # Handle parameter tensor - weight_key = tensor_node.target - modname, _, param_name = weight_key.rpartition(".") - param = gm.get_parameter(weight_key) - - # Update the parameter with its shard - param_new = nn.Parameter(slice_tensor(param).detach().clone(), requires_grad=True) - gm.get_submodule(modname).register_parameter(param_name, param_new) - - # Register load state dict hook - gm._register_load_state_dict_pre_hook( - partial( - _load_hook, - f_split=slice_tensor, - param_key=weight_key, - param_shape=param_new.shape, - ) - ) - else: - # Handle dynamic tensor - with gm.graph.inserting_before(bmm_node): - tensor_slice = gm.graph.call_function( - torch.ops.aten.slice.Tensor, args=(tensor_node, 0, start_idx, end_idx, 1) - ) - # Update BMM node to use the sliced tensor - bmm_node.update_arg(arg_idx, tensor_slice) - - # Get the input tensors - lhs_tensor = node.args[0] - rhs_tensor = node.args[1] - # Handle both tensors - handle_tensor(node, lhs_tensor, 0, self.start_idx, self.end_idx) - handle_tensor(node, rhs_tensor, 1, self.start_idx, self.end_idx) - - # Add all_gather node after BMM to collect results - all_gather_op, _ = _get_dist_ops(self.dist_backend) - with gm.graph.inserting_after(node): - gather_node = gm.graph.call_function( - all_gather_op, - args=(node, 0), # Gather along batch dimension (0) - ) - node.replace_all_uses_with(gather_node) - gather_node.replace_input_with(gather_node, node) - - -def _insert_sharded_moe( - gm: GraphModule, - node: Node, - rank: int, - world_size: int, - allreduce_strategy: AllReduceStrategy, - dist_backend: str, - 
scale_names: Sequence[str] = (), -): - """Update the torch_moe node with sharded weight lists or stacked tensors, - sharded `selected_experts` and `final_scales(router_logics)`. - Add an all_reduce node after the moe node. - - Handles both: - - Standard format: per-expert weight lists - - Stacked format: single-element lists containing stacked 3D tensors (Llama4 pattern) - - NOTE: allreduce_strategy is MANDATORY. - """ - if allreduce_strategy is None: - raise ValueError(f"allreduce_strategy must be set for MoE sharding on node {node.name}") - scale_names = list(scale_names) - - # Detect format: check if w1_weight is a single-element list with a 3D tensor (stacked format) - w1_weight_arg = node.args[3] - is_stacked = False - - # In FX graphs, the list might be a Node representing a list() call - if isinstance(w1_weight_arg, Node): - # Check if this is a list() call node - if w1_weight_arg.target is list and len(w1_weight_arg.args) > 0: - # Get the actual list content from the args - list_content = w1_weight_arg.args[0] - if isinstance(list_content, (list, tuple)) and len(list_content) == 1: - first_elem = list_content[0] - if isinstance(first_elem, Node) and first_elem.op == "get_attr": - try: - tensor = gm.get_parameter(first_elem.target) - is_stacked = tensor.ndim == 3 - except (AttributeError, KeyError): - pass - elif isinstance(first_elem, torch.Tensor): - is_stacked = first_elem.ndim == 3 - # Handle case where it's a direct Python list (not in FX graph context) - elif isinstance(w1_weight_arg, (list, tuple)) and len(w1_weight_arg) == 1: - first_elem = w1_weight_arg[0] - if isinstance(first_elem, Node) and first_elem.op == "get_attr": - try: - tensor = gm.get_parameter(first_elem.target) - is_stacked = tensor.ndim == 3 - except (AttributeError, KeyError): - pass - elif isinstance(first_elem, torch.Tensor): - is_stacked = first_elem.ndim == 3 - - if is_stacked: - # Use stacked tensor sharding logic (similar to _insert_sharded_moe_bmm) - 
_insert_sharded_moe_stacked(gm, node, rank, world_size, allreduce_strategy, scale_names) - return - - # Standard per-expert list sharding - # For FX graphs, get the list from the Node; for direct calls, use the list directly - if isinstance(w1_weight_arg, Node) and w1_weight_arg.target is list: - # Extract the list content from the list() call node - num_experts = len(w1_weight_arg.args[0]) if w1_weight_arg.args else 0 - elif isinstance(w1_weight_arg, (list, tuple)): - num_experts = len(w1_weight_arg) - else: - raise ValueError(f"Unexpected w1_weight format in node {node.name}: {type(w1_weight_arg)}") - args = list(node.args) - - # -- Handle selected_experts and final_scales sharding -- - selected_experts = args[1] - final_scales = args[2] - - experts_per_rank = num_experts // world_size - - with gm.graph.inserting_before(node): - lower = experts_per_rank * rank - # selected_experts_local = selected_experts - low - selected_experts_local = gm.graph.create_node( - "call_function", operator.sub, args=(selected_experts, lower), kwargs={} - ) - - # For num_experts % world_size != 0 case, - # assign the last (num_experts % world_size) experts to the last rank - # if rank == world_size -1: - # rank_mask = (selected_experts // experts_per_rank) >= rank - # else: - # rank_mask = (selected_experts // experts_per_rank) == rank - div_node = gm.graph.create_node( - "call_function", operator.floordiv, args=(selected_experts, experts_per_rank), kwargs={} - ) - comp_op = torch.ge if rank == world_size - 1 else torch.eq - rank_mask = gm.graph.create_node("call_function", comp_op, args=(div_node, rank), kwargs={}) - - # final_scales_local = final_scales * rank_mask - final_scales_local = gm.graph.create_node( - "call_function", operator.mul, args=(final_scales, rank_mask), kwargs={} - ) - - # -- Shard expert weights -- - def get_partition(lst, world_size, rank): - num_experts = len(lst) - expert_size_per_partition = num_experts // world_size - expert_start = rank * 
expert_size_per_partition - # For num_experts % world_size != 0 case, - # assign the last (num_experts % world_size) experts to the last rank - expert_end = ( - num_experts if (rank == world_size - 1) else expert_start + expert_size_per_partition - ) - return lst[expert_start:expert_end] - - w1_list_sharded = get_partition(args[3], world_size, rank) - w2_list_sharded = get_partition(args[4], world_size, rank) - w3_list_sharded = get_partition(args[5], world_size, rank) - - # -- Update args -- - args[1] = selected_experts_local - args[2] = final_scales_local - args[3] = w1_list_sharded - args[4] = w2_list_sharded - args[5] = w3_list_sharded - - # Shard scales for quantized ops - for i in range(len(scale_names) * 3): # 3 layers (w1, w2, w3) × #scale_names per layer - args[6 + i] = get_partition(args[6 + i], world_size, rank) - - ad_logger.debug( - f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." - ) - node.args = tuple(args) - - # -- add an all_reduce node -- - _, all_reduce_op = _get_dist_ops(dist_backend) - with gm.graph.inserting_after(node): - dist_node = gm.graph.call_function(all_reduce_op, args=(node, allreduce_strategy.name)) - node.replace_all_uses_with(dist_node) - dist_node.replace_input_with(dist_node, node) - - -def _slice_expert_dim( - gm: GraphModule, - tensor_node_or_tensor: Union[Node, torch.Tensor], - lo: int, - hi: int, -) -> Union[Node, torch.Tensor]: - """Slice expert weights along dim 0 and register load hook (simple version). - - This is the original simple slicing function used by MXFP4 EP sharding. - For parameters, it modifies them in-place and returns the same node. 
- - Args: - gm: The graph module - tensor_node_or_tensor: Either a Node (from FX graph) or a Tensor - lo: Start index for slicing - hi: End index for slicing - - Returns: - Node or Tensor depending on input type - """ - # Handle raw tensor case - if isinstance(tensor_node_or_tensor, torch.Tensor): - return tensor_node_or_tensor[lo:hi] - - # Handle Node case - tensor_node = tensor_node_or_tensor - - if tensor_node.op != "get_attr": - # If not a parameter node, just add a runtime slice node after it - with gm.graph.inserting_after(tensor_node): - return gm.graph.call_function( - torch.ops.aten.slice.Tensor, - args=(tensor_node, 0, lo, hi, 1), - ) - - # Get the parameter - param_key = str(tensor_node.target) - modname, _, param_name = param_key.rpartition(".") - submod = gm.get_submodule(modname) if modname else gm - full_param = getattr(submod, param_name) - - # Slice the parameter - sliced_param = full_param[lo:hi].detach().clone() - sliced_shape = sliced_param.shape - - # Define slice function for load hook - def slice_expert_tensor(t: torch.Tensor) -> torch.Tensor: - return t[lo:hi] - - # Register load hook to slice during checkpoint loading - gm._register_load_state_dict_pre_hook( - partial( - _load_hook, - f_split=slice_expert_tensor, - param_key=param_key, - param_shape=sliced_shape, - ) - ) - - # Replace the parameter with the sliced version - new_param = nn.Parameter(sliced_param, requires_grad=False) - setattr(submod, param_name, new_param) - - # Return the same node (it now points to the sliced parameter) - return tensor_node - - -def _transform_bmm_moe_weight_param( - gm: GraphModule, - param_node: Node, - lo: int, - hi: int, - swap_gate_up: bool = False, -) -> None: - """Transform a parameter for BMM MoE: slice experts, optionally swap gate/up, transpose. - - This modifies the parameter in-place and registers a load hook. - Does NOT create graph nodes - those should be created separately by the caller. 
- - Args: - gm: Graph module - param_node: The get_attr node for the parameter - lo: Start index for expert slicing - hi: End index for expert slicing - swap_gate_up: If True, swap W1 and W3 (Llama4 -> TRT-LLM format) - """ - if param_node.op != "get_attr": - return # Only works on parameters - - param_key = str(param_node.target) - modname, _, param_name = param_key.rpartition(".") - submod = gm.get_submodule(modname) if modname else gm - full_param = getattr(submod, param_name) - - # Slice the parameter along expert dimension (dim 0) - sliced_param = full_param[lo:hi].detach().clone() - - # Swap W1 and W3 if needed (for gate_up weights) - # Llama4: (E, H, 2*I) with [W1, W3], TRT-LLM wants [W3, W1] - if swap_gate_up and sliced_param.ndim == 3: - intermediate_size = sliced_param.shape[2] // 2 - w1 = sliced_param[:, :, :intermediate_size] - w3 = sliced_param[:, :, intermediate_size:] - sliced_param = torch.cat([w3, w1], dim=2) - - # Transpose: Llama4 (E, H, X) -> TRT-LLM (E, X, H) - transposed_param = sliced_param.transpose(1, 2) - transposed_shape = transposed_param.shape - - # Define transformation function for load hook - def transform_tensor(t: torch.Tensor) -> torch.Tensor: - t_sliced = t[lo:hi] - if swap_gate_up and t_sliced.ndim == 3: - intermediate_size = t_sliced.shape[2] // 2 - w1 = t_sliced[:, :, :intermediate_size] - w3 = t_sliced[:, :, intermediate_size:] - t_sliced = torch.cat([w3, w1], dim=2) - return t_sliced.transpose(1, 2).contiguous() - - # Register load hook - gm._register_load_state_dict_pre_hook( - partial( - _load_hook, - f_split=transform_tensor, - param_key=param_key, - param_shape=transposed_shape, - ) - ) - - # Replace the parameter with the transformed version - new_param = nn.Parameter(transposed_param, requires_grad=False) - setattr(submod, param_name, new_param) - - -def _get_dim0_from_arg(gm: GraphModule, arg: Union[Node, torch.Tensor]) -> int: - """Helper to get the first dimension size of an argument (Node or Tensor).""" - if 
isinstance(arg, torch.Tensor): - return arg.shape[0] - if isinstance(arg, Node): - if arg.op == "get_attr": - # Traverse attributes to find the tensor - obj = gm - for atom in arg.target.split("."): - obj = getattr(obj, atom) - return obj.shape[0] - if "val" in arg.meta: - return arg.meta["val"].shape[0] - raise ValueError(f"Cannot determine shape[0] for {arg}") - - -def _insert_sharded_moe_stacked( - gm: GraphModule, - node: Node, - rank: int, - world_size: int, - allreduce_strategy: AllReduceStrategy, - scale_names: Sequence[str] = (), -): - """Update the torch_moe node with sliced stacked weight tensors, - sharded `selected_experts` and `final_scales(router_logics)`. - Add an all_reduce node after the moe node. - - For torch_moe with stacked tensor format (single-element lists containing 3D tensors). - - NOTE: allreduce_strategy is MANDATORY and must be explicitly provided. - """ - if allreduce_strategy is None: - raise ValueError(f"allreduce_strategy must be set for MoE sharding on node {node.name}") - - # Extract the stacked tensors from single-element lists - # args[3] = w1_weight (Node representing list with one 3D tensor, or direct list) - # args[4] = w2_weight (Node representing list with one 3D tensor, or direct list) - - # Helper to extract tensor node from list (handles both Node and direct list) - def extract_tensor_from_list_arg(list_arg): - if isinstance(list_arg, Node) and list_arg.target is list: - # It's a list() call node - extract from its args - return list_arg.args[0][0] # args[0] is the list content, [0] is first element - elif isinstance(list_arg, (list, tuple)): - # Direct list - return list_arg[0] - else: - raise ValueError(f"Unexpected list format: {type(list_arg)}") - - w3_w1_tensor_node = extract_tensor_from_list_arg(node.args[3]) - w2_tensor_node = extract_tensor_from_list_arg(node.args[4]) - num_experts = _get_dim0_from_arg(gm, w3_w1_tensor_node) - - args = list(node.args) - - # -- Handle selected_experts and final_scales sharding -- 
- selected_experts = args[1] - final_scales = args[2] - - experts_per_rank = num_experts // world_size - - with gm.graph.inserting_before(node): - lower = experts_per_rank * rank - # selected_experts_local = selected_experts - low - selected_experts_local = gm.graph.create_node( - "call_function", operator.sub, args=(selected_experts, lower), kwargs={} - ) - - # For num_experts % world_size != 0 case, - # assign the last (num_experts % world_size) experts to the last rank - div_node = gm.graph.create_node( - "call_function", operator.floordiv, args=(selected_experts, experts_per_rank), kwargs={} - ) - - comp_op = torch.ge if rank == world_size - 1 else torch.eq - rank_mask = gm.graph.create_node("call_function", comp_op, args=(div_node, rank), kwargs={}) - - # final_scales_local = final_scales * rank_mask - final_scales_local = gm.graph.create_node( - "call_function", operator.mul, args=(final_scales, rank_mask), kwargs={} - ) - - # -- Transform expert weight parameters -- - local_lo, local_hi = _split_range_last_remainder(num_experts, world_size, rank) - - # Transform w3_w1_stacked: slice experts, swap [W1,W3]->[W3,W1], transpose (E,H,2I)->(E,2I,H) - if isinstance(w3_w1_tensor_node, Node): - _transform_bmm_moe_weight_param( - gm, w3_w1_tensor_node, local_lo, local_hi, swap_gate_up=True - ) - - # Transform w2_stacked: slice experts, transpose (E,I,H)->(E,H,I) - if isinstance(w2_tensor_node, Node): - _transform_bmm_moe_weight_param(gm, w2_tensor_node, local_lo, local_hi, swap_gate_up=False) - - # -- Update args (keep same lists/nodes, just with transformed parameters) -- - args[1] = selected_experts_local - args[2] = final_scales_local - # args[3] and args[4] stay the same - we modified the parameters in-place - - ad_logger.debug( - f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}." 
- ) - - node.args = tuple(args) - - # -- add an all_reduce node -- - with gm.graph.inserting_after(node): - dist_node = gm.graph.call_function( - torch.ops.auto_deploy.torch_dist_all_reduce.default, - args=(node, allreduce_strategy.name), - ) - node.replace_all_uses_with(dist_node) - dist_node.replace_input_with(dist_node, node) - - -def _split_range_last_remainder(n: int, world_size: int, rank: int): - """[lo, hi) split along dim0; last rank gets remainder.""" - base = n // world_size - lo = base * rank - hi = n if rank == world_size - 1 else base * (rank + 1) - return lo, hi - - -def _insert_sharded_mxfp4_mlp_ep( - gm: GraphModule, - node: Node, - rank: int, - world_size: int, - allreduce_strategy: AllReduceStrategy, - dist_backend: str, -): - """Transform a call to auto_deploy::triton_mxfp4_moe into: - - sharded expert parameters along dim 0 (this rank slice), - - call to auto_deploy::triton_mxfp4_moe_ep(..., local_lo, local_hi), - - followed by torch_dist_all_reduce/trtllm_dist_all_reduce. - - NOTE: allreduce_strategy is MANDATORY and must be explicitly provided. 
- - Expects the original op signature: - (hidden_states, - router_weight, router_bias, top_k, - gate_up_blocks, gate_up_bias, gate_up_scales, - alpha, limit, - down_blocks, down_bias, down_scales) - """ - if allreduce_strategy is None: - raise ValueError( - f"allreduce_strategy must be set for MXFP4 MLP EP sharding on node {node.name}" - ) - - IDX_GATE_UP_BLOCKS = 4 - IDX_GATE_UP_BIAS = 5 - IDX_GATE_UP_SCALES = 6 - IDX_DOWN_BLOCKS = 9 - IDX_DOWN_BIAS = 10 - IDX_DOWN_SCALES = 11 - - gate_up_blocks_node = node.args[IDX_GATE_UP_BLOCKS] - num_experts = int(gate_up_blocks_node.meta["val"].shape[0]) - - local_lo, local_hi = _split_range_last_remainder(num_experts, world_size, rank) - - # Prepare new args with slices for this rank - args = list(node.args) - args[IDX_GATE_UP_BLOCKS] = _slice_expert_dim(gm, args[IDX_GATE_UP_BLOCKS], local_lo, local_hi) - args[IDX_GATE_UP_BIAS] = _slice_expert_dim(gm, args[IDX_GATE_UP_BIAS], local_lo, local_hi) - args[IDX_GATE_UP_SCALES] = _slice_expert_dim(gm, args[IDX_GATE_UP_SCALES], local_lo, local_hi) - args[IDX_DOWN_BLOCKS] = _slice_expert_dim(gm, args[IDX_DOWN_BLOCKS], local_lo, local_hi) - args[IDX_DOWN_BIAS] = _slice_expert_dim(gm, args[IDX_DOWN_BIAS], local_lo, local_hi) - args[IDX_DOWN_SCALES] = _slice_expert_dim(gm, args[IDX_DOWN_SCALES], local_lo, local_hi) - - args_ep = tuple(args) + (int(world_size), int(rank)) - node.target = torch.ops.auto_deploy.triton_mxfp4_moe_ep.default - node.args = args_ep - - # Add a dist all-reduce after the op (sum partial results across EP ranks) - _, all_reduce_op = _get_dist_ops(dist_backend) - with gm.graph.inserting_after(node): - red = gm.graph.call_function(all_reduce_op, args=(node, allreduce_strategy.name)) - node.replace_all_uses_with(red) - # keep dataflow: red(input=node) - red.replace_input_with(red, node) - - -class EPShardingInfo(ShardingTransformInfo): - """Configuration for EP sharding transformations. 
- - NOTE: allreduce_strategy and dist_backend will be automatically injected by - ShardingConfig.add() if not provided at creation time. The values come from - the parent ShardingConfig. - """ - - allreduce_strategy: Optional[AllReduceStrategy] = None # Set by ShardingConfig.add() if None - dist_backend: Optional[str] = None # Set by ShardingConfig.add() if None - - @classmethod - def from_node(cls, node: Node, **kwargs) -> "EPShardingInfo": - """ - Create the correct EPShardingInfo subclass (FP8/NVFP4/base) based on `node`. - """ - subcls = _resolve_ep_cls_from_node(node) - return subcls(target_node=node.name, **kwargs) - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """Validate the transformation configuration.""" - if not is_op(node, torch.ops.auto_deploy.torch_moe): - ad_logger.warning(f"EP sharding is only supported for MOE nodes. Skipping {self}.") - return False - return True - - def apply(self, gm: GraphModule, node: Node) -> None: - """Apply EP sharding transformation to the graph module.""" - _insert_sharded_moe( - gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend, [] - ) - - -class MXFP4EPShardingInfo(EPShardingInfo): - """GPT-OSS style MXFP4-specific EP sharding behavior.""" - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - """Validate the transformation configuration.""" - if not is_op(node, torch.ops.auto_deploy.triton_mxfp4_moe): - ad_logger.warning(f"EP sharding is only supported for MOE nodes. 
Skipping {self}.") - return False - return True - - def apply(self, gm: GraphModule, node: Node) -> None: - _insert_sharded_mxfp4_mlp_ep( - gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend - ) - - -class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin): - """FP8-specific EP sharding behavior.""" - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - if not is_op(node, torch.ops.auto_deploy.torch_quant_fp8_moe): - ad_logger.warning(f"EP sharding is only supported for MOE nodes. Skipping {self}.") - return False - return True - - def scale_names(self) -> List[str]: - return ["input_scale", "weight_scale"] - - def apply(self, gm: GraphModule, node: Node) -> None: - _insert_sharded_moe( - gm, - node, - self.rank, - self.world_size, - self.allreduce_strategy, - self.dist_backend, - self.scale_names(), - ) - - -class NVFP4EPShardingInfo(EPShardingInfo, QuantizationShardingMixin): - """NVFP4-specific EP sharding behavior.""" - - def validate(self, gm: GraphModule = None, node: Node = None) -> bool: - if not is_op(node, torch.ops.auto_deploy.torch_quant_nvfp4_moe): - ad_logger.warning(f"EP sharding is only supported for MOE nodes. 
Skipping {self}.") - return False - return True - - def scale_names(self) -> List[str]: - return ["input_scale", "weight_scale", "alpha"] - - def apply(self, gm: GraphModule, node: Node) -> None: - _insert_sharded_moe( - gm, - node, - self.rank, - self.world_size, - self.allreduce_strategy, - self.dist_backend, - self.scale_names(), - ) - - -EP_SHARDING_RULES = [ - (lambda n: is_op(n, torch.ops.auto_deploy.torch_quant_fp8_moe), FP8EPShardingInfo), - (lambda n: is_op(n, torch.ops.auto_deploy.torch_quant_nvfp4_moe), NVFP4EPShardingInfo), - (lambda n: is_op(n, torch.ops.auto_deploy.torch_moe), EPShardingInfo), - (lambda n: is_op(n, torch.ops.auto_deploy.triton_mxfp4_moe), MXFP4EPShardingInfo), -] - - -def _resolve_ep_cls_from_node(node: Node) -> type[EPShardingInfo]: - for pred, cls in EP_SHARDING_RULES: - try: - if pred(node): - return cls - except Exception: - # Missing op variant in this build or other harmless issues — keep trying. - pass - return EPShardingInfo - - -class ShardingSource(Enum): - """Enum for sharding source.""" - - HEURISTIC = "heuristic" - FACTORY = "factory" - MANUAL = "manual" - - -class ShardingDim(Enum): - """Enum for sharding dimension.""" - - TP = "tp" - EP = "ep" - BMM = "bmm" - - -class DistBackend(Enum): - """Enum for distributed backend.""" - - AUTO = "auto" - TRTLLM = "trtllm" - TORCH = "torch" - - -class ShardingTransformContainer(BaseModel): - """Configuration for sharding the model.""" - - factory_source: ShardingConfigSource = Field(default=ShardingConfigSource.UNKNOWN) - rank: int = Field(default=0) - world_size: int = Field(default=1) - factory_config: Dict[str, Any] = Field(default_factory=dict) - manual_config: Dict[str, Any] = Field(default_factory=dict) - simple_shard_only: bool = Field(default=False) - support_partial_config: bool = Field(default=True) - sharding_source: List[ShardingSource] = Field( - default_factory=lambda: [ShardingSource.HEURISTIC] - ) - sharding_dims: List[ShardingDim] = Field( - default_factory=lambda: 
[ShardingDim.TP, ShardingDim.EP, ShardingDim.BMM] - ) - allreduce_strategy: AllReduceStrategy = Field( - default=AllReduceStrategy.AUTO, - description="AllReduce strategy for distributed operations. " - "Options: AUTO, NCCL, ONESHOT, TWOSHOT, MIN_LATENCY, LOWPRECISION, UB, MNNVL, NCCL_SYMMETRIC, SYMM_MEM", - ) - dist_backend: DistBackend = Field(default=DistBackend.AUTO) - weight_sharding_transforms: List[WeightShardingInfo] = Field(default_factory=list) - parameter_update_transforms: List[ParameterUpdateInfo] = Field(default_factory=list) - bmm_transforms: List[BMMShardingInfo] = Field(default_factory=list) - ep_transforms: List[EPShardingInfo] = Field(default_factory=list) - - @field_validator("allreduce_strategy", mode="before") - @classmethod - def _validate_allreduce_strategy(cls, v): - """Convert string names like 'AUTO' to AllReduceStrategy enum.""" - return validate_allreduce_strategy(v) - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._transform_list_dict = { - WeightShardingInfo: self.weight_sharding_transforms, - BMMShardingInfo: self.bmm_transforms, - EPShardingInfo: self.ep_transforms, - ParameterUpdateInfo: self.parameter_update_transforms, - } - - def init_params( - self, other: "ShardingTransformConfig", rank: int = None, world_size: int = None - ) -> None: - """ - Copy parameters from ShardingTransformConfig. The class is not - imported here to avoid circular imports. 
- """ - if rank is not None: - self.rank = rank - if world_size is not None: - self.world_size = world_size - self.factory_config = other.factory_config - self.manual_config = other.manual_config - self.simple_shard_only = other.simple_shard_only - self.support_partial_config = other.support_partial_config - self.sharding_dims = other.sharding_dims - self.sharding_source = other.sharding_source - # Extract factory_source from factory_config if present - self.factory_source = self.factory_config.get("source", ShardingConfigSource.UNKNOWN) - self.allreduce_strategy = other.allreduce_strategy - self.dist_backend = other.dist_backend - self.validate_config(ShardingSource.MANUAL) - self.validate_config(ShardingSource.FACTORY) - - def add(self, transform: ShardingTransformInfo) -> bool: - """Append a transform only if that node was - not sharded before. Do not overwrite existing transforms. - - Automatically propagates allreduce_strategy and dist_backend from this config - to the transform if the transform doesn't already have them set. 
- """ - # Inject allreduce_strategy and dist_backend from config into transform - # if they have the attributes and they're None - # This creates a new transform instance with the values set - needs_injection = False - transform_dict = None - - if hasattr(transform, "allreduce_strategy") and transform.allreduce_strategy is None: - if transform_dict is None: - transform_dict = transform.model_dump() - transform_dict["allreduce_strategy"] = self.allreduce_strategy - needs_injection = True - - if hasattr(transform, "dist_backend") and transform.dist_backend is None: - if transform_dict is None: - transform_dict = transform.model_dump() - transform_dict["dist_backend"] = self.dist_backend - needs_injection = True - - if needs_injection: - transform = type(transform)(**transform_dict) - - # Find the appropriate list by checking inheritance - transform_list = None - for base_class, transform_list_candidate in self._transform_list_dict.items(): - if isinstance(transform, base_class): - transform_list = transform_list_candidate - break - - if transform_list is None: - raise ValueError(f"Unknown transform type: {type(transform)}") - - # Check if node already has a transform - for existing_transform in transform_list: - if existing_transform.target_node == transform.target_node: - return False - transform_list.append(transform) - return True - - def validate_config(self, source: ShardingSource) -> bool: - if ( - source == ShardingSource.FACTORY - and self.factory_source != ShardingConfigSource.HUGGINGFACE - ): - ad_logger.debug( - "Sharding config is currently only supported for HuggingFace. Skipping." - ) - # invalidate the config - self.factory_config.clear() - return False - - config = self.manual_config if source == ShardingSource.MANUAL else self.factory_config - - if "head_dim" not in config: - ad_logger.debug("Sharding config does not contain head_dim. 
Skipping.") - # invalidate the config - config.clear() - return False - - if "tp_plan" not in config or config["tp_plan"] is None: - ad_logger.debug("Sharding config does not contain tp_plan. Skipping.") - # invalidate the config - config.clear() - return False - tp_plan = config["tp_plan"] - - values = set(tp_plan.values()) - supported_modes = { - "colwise", # row split and no collective - "rowwise", # column split and all-reduce - "mamba", # mamba SSM layer - "gather", # simple shard (row + all_gather) - # TODO: remaining values are not supported yet. - # They require hybrid EP+TP and/or SP support. - # "sequence_parallel", # sequence parallelism - # "local_colwise", - # "local_rowwise", - # "local_packed_rowwise", - # "local", - } - if not self.support_partial_config and not values.issubset(supported_modes): - ad_logger.debug("Sharding config contains invalid values. Skipping.") - # invalidate the config - config.clear() - return False - return True - - def get_factory_config(self) -> Dict[str, Any]: - return self.factory_config - - def get_manual_config(self) -> Dict[str, Any]: - return self.manual_config diff --git a/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py b/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py index c4a93f2011..89e18351f3 100644 --- a/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py +++ b/tests/unittest/_torch/auto_deploy/_utils_test/_graph_test_helpers.py @@ -226,6 +226,12 @@ def run_sharding_pattern_detection_test( detected_transformations: List of detected transformation configurations expected_transformations: List of expected transformation configurations """ + # Remove config field from transformations + for transform in detected_transformations: + transform.config = None + for transform in expected_transformations: + transform.config = None + # Convert to sets for unordered comparison detected_set = set(detected_transformations) expected_set = set(expected_transformations) 
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py index a8b4638ade..0c386330af 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py @@ -13,12 +13,13 @@ from click.testing import CliRunner from utils.cpp_paths import llm_root # noqa: F401 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm -from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op -from tensorrt_llm._torch.auto_deploy.utils.sharding_utils import ( +from tensorrt_llm._torch.auto_deploy.transform.library.sharding import ( + ShardingTransformConfig, ShardingTransformContainer, SplitDimension, WeightShardingInfo, ) +from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op from tensorrt_llm.commands.bench import main from tensorrt_llm.functional import AllReduceStrategy @@ -268,16 +269,20 @@ def test_allreduce_strategy_propagation(strategy): # Create sharding config with specified strategy rank, world_size = 0, 4 - sharding_container = ShardingTransformContainer( - rank=rank, world_size=world_size, allreduce_strategy=AllReduceStrategy[strategy] + + config = ShardingTransformConfig( + rank=rank, + world_size=world_size, + stage="sharding", + allreduce_strategy=AllReduceStrategy[strategy], ) + sharding_container = ShardingTransformContainer(config=config) # Add transforms: column shard linear1, row shard linear2 (triggers allreduce) sharding_container.add( WeightShardingInfo( target_node=linear1_node.name, - rank=rank, - world_size=world_size, + config=config, split_dim=SplitDimension.COLUMN, dist_op=None, ) @@ -285,8 +290,7 @@ def test_allreduce_strategy_propagation(strategy): sharding_container.add( WeightShardingInfo( target_node=linear2_node.name, - rank=rank, - world_size=world_size, + config=config, 
split_dim=SplitDimension.ROW, dist_op="all_reduce", ) @@ -295,8 +299,9 @@ def test_allreduce_strategy_propagation(strategy): # Verify transforms have the strategy injected assert len(sharding_container.weight_sharding_transforms) == 2 for transform in sharding_container.weight_sharding_transforms: - assert transform.allreduce_strategy == AllReduceStrategy[strategy], ( - f"Transform {transform.target_node} should have strategy {strategy}, got {transform.allreduce_strategy}" + assert transform.config.allreduce_strategy == AllReduceStrategy[strategy], ( + f"Transform {transform.target_node} should have strategy {strategy}, " + f"got {transform.config.allreduce_strategy}" ) # Apply transforms diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_bmm_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_bmm_sharding.py index ff46903aaf..77d2c3ecb4 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_bmm_sharding.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_bmm_sharding.py @@ -10,9 +10,13 @@ from _graph_test_helpers import run_sharding_pattern_detection_test, run_test_tr import tensorrt_llm._torch.auto_deploy.distributed.common as dist_common from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm -from tensorrt_llm._torch.auto_deploy.transform.library.sharding import BMMShardingInfo +from tensorrt_llm._torch.auto_deploy.transform.library.sharding import ( + BMMShardingInfo, + ShardingTransformConfig, +) from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op +from tensorrt_llm.functional import AllReduceStrategy class BMM(nn.Module): @@ -108,6 +112,12 @@ def _run_pattern_detection_job( # Test pattern detection - create expected transformations for validation gm = torch_export_to_gm(model, args=(x,), clone=True) 
expected_transformations = [] + config = ShardingTransformConfig( + rank=rank, + world_size=world_size, + stage="sharding", + allreduce_strategy=AllReduceStrategy.AUTO, + ) # if world_size == 1, no sharding transformations should be detected if world_size > 1: for node in gm.graph.nodes: @@ -115,8 +125,7 @@ def _run_pattern_detection_job( expected_transformations.append( BMMShardingInfo( target_node=node.name, - rank=rank, - world_size=world_size, + config=config, start_idx=start_idx, end_idx=end_idx, dist_backend="auto", diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_ep_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_ep_sharding.py index f1a6e5ce19..2d5e0bd8a5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_ep_sharding.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_ep_sharding.py @@ -10,13 +10,15 @@ from _model_test_utils import MoEOpModel import tensorrt_llm._torch.auto_deploy.distributed.common as dist_common from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm -from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer -from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op -from tensorrt_llm._torch.auto_deploy.utils.sharding_utils import ( +from tensorrt_llm._torch.auto_deploy.transform.library.sharding import ( EPShardingInfo, FP8EPShardingInfo, + MLPType, NVFP4EPShardingInfo, + ShardingTransformConfig, ) +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer +from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op from tensorrt_llm.functional import AllReduceStrategy @@ -87,35 +89,36 @@ def _run_pattern_detection_job(num_experts: int, rank: int, world_size: int) -> expected_transformations = [] # if world_size == 1, no sharding transformations should be detected if world_size > 1: + config = 
ShardingTransformConfig( + rank=rank, + world_size=world_size, + stage="sharding", + allreduce_strategy=AllReduceStrategy.AUTO, + dist_backend="auto", + ) for node in gm.graph.nodes: if is_op(node, torch.ops.auto_deploy.torch_moe): expected_transformations.append( EPShardingInfo( target_node=node.name, - rank=rank, - world_size=world_size, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", + config=config, + mlp_type=MLPType.GATED_MLP, ) ) elif is_op(node, torch.ops.auto_deploy.torch_quant_fp8_moe): expected_transformations.append( FP8EPShardingInfo( target_node=node.name, - rank=rank, - world_size=world_size, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", + config=config, + mlp_type=MLPType.GATED_MLP, ) ) elif is_op(node, torch.ops.auto_deploy.torch_quant_nvfp4_moe): expected_transformations.append( NVFP4EPShardingInfo( target_node=node.name, - rank=rank, - world_size=world_size, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", + config=config, + mlp_type=MLPType.GATED_MLP, ) ) diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py index 9a8f0d5164..b4f82edcfa 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py @@ -14,12 +14,14 @@ from _model_test_utils import FakeFP8Linear import tensorrt_llm._torch.auto_deploy.distributed.common as dist_common from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.library.sharding import ( + FP8WeightShardingInfo, + LayerType, + ShardingTransformConfig, SplitDimension, WeightShardingInfo, ) from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from 
tensorrt_llm._torch.auto_deploy.utils.node_utils import is_linear_op, is_op -from tensorrt_llm._torch.auto_deploy.utils.sharding_utils import FP8TPShardingInfo, LayerType from tensorrt_llm.functional import AllReduceStrategy base_model_tp_plan = { @@ -253,6 +255,12 @@ def _run_pattern_detection_job( x = torch.randn(batch_size, sequence_len, num_features, device="cuda", dtype=torch.float16) # Test pattern detection - create expected transformations for validation + config = ShardingTransformConfig( + rank=rank, + world_size=world_size, + stage="sharding", + allreduce_strategy=AllReduceStrategy.AUTO, + ) gm = torch_export_to_gm(model, args=(x,), clone=True) expected_transformations = [] # if world_size == 1, no sharding transformations should be detected @@ -275,12 +283,9 @@ def _run_pattern_detection_job( WeightShardingInfo( target_node=node.name, split_dim=dim, - rank=rank, - world_size=world_size, + config=config, dist_op=dist_op, min_local_shape=min_local_shape, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", layer_type=LayerType.ATTENTION, ) ) @@ -299,12 +304,10 @@ def _run_pattern_detection_job( WeightShardingInfo( target_node=node.name, split_dim=dim, - rank=rank, - world_size=world_size, + config=config, dist_op=dist_op, min_local_shape=1, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", + layer_type=LayerType.MLP, ) ) elif model_cls == nn.Linear: @@ -315,12 +318,10 @@ def _run_pattern_detection_job( WeightShardingInfo( target_node=node.name, split_dim=SplitDimension.COLUMN, # Simple shard uses dim=0 - rank=rank, - world_size=world_size, + config=config, dist_op="all_gather", min_local_shape=1, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", + layer_type=LayerType.MLP, ) ) elif model_cls == FP8MLP: @@ -335,15 +336,12 @@ def _run_pattern_detection_job( dim = SplitDimension.ROW dist_op = "all_reduce" expected_transformations.append( - FP8TPShardingInfo( + FP8WeightShardingInfo( target_node=node.name, 
split_dim=dim, - rank=rank, - world_size=world_size, + config=config, dist_op=dist_op, min_local_shape=1, - allreduce_strategy=AllReduceStrategy.AUTO, - dist_backend="auto", ) ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py index 005e893af0..4fb9cc1359 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py @@ -6,11 +6,11 @@ from tensorrt_llm._torch.auto_deploy.transform.interface import TransformConfig from tensorrt_llm._torch.auto_deploy.transform.library.quantization import ( FP8LinearQuantizationFromConfig, ) +from tensorrt_llm._torch.auto_deploy.transform.library.sharding import _shard_fp4_weight_scale from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import ( fp4_global_scale, modelopt_fp4_scale_to_cutlass_fp4_scale, ) -from tensorrt_llm._torch.auto_deploy.utils.sharding_utils import _shard_fp4_weight_scale @pytest.mark.parametrize("dim", [0, 1]) From 9eb5a229dd0105644b39f64043faee8d8b338a9d Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:26:18 +0800 Subject: [PATCH 146/172] [None][infra] Fully waive test_worker_restart test_disagg_server_restart. 
(#9988) Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 23262dedc1..f8866583f3 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -412,8 +412,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-kv_cache_aware] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] SKIP (https://nvbugs/5726066) +disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/5726118) disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] SKIP (https://nvbugs/5736923) unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctness[Nemotron-Nano-3-30B-A3.5B-dev-1024-mamba_ssm_cache_dtype:None] SKIP (https://nvbugs/5721644) @@ -421,7 +425,6 @@ 
unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_correctnes accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5727475) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5740377) From ad12b795c9d860afc00d2e178c991c31e4b51ef2 Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Mon, 15 Dec 2025 19:31:56 +0800 Subject: [PATCH 147/172] [https://nvbugs/5661741][fix] Fix accuracy issue in TRTLLM MoE introduced in #9377 (#9999) Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- .../trtllmGen_bmm_export/KernelMetaInfo.h | 226 +++++++++--------- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- 
...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...eGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- 
...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...hedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...hedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...maOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- 
...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp | 4 +- ...iGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 4 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- ...aOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp | 2 +- 113 files changed, 293 insertions(+), 293 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index abcdbec479..fedfd7805c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,7 +28,7 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "3b034f4a" +#define TLLM_GEN_COMMIT "26da1b43" #define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0" static constexpr size_t tllmGenBatchedGemmListLen = 449; @@ -23186,7 +23186,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "b514130737a69f6ce244b71f911048b846c5afd30f2b4e1a35e6899de45d8e64", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "99849863a46faf8346862ef3f86aedcde29cadd274ca51a3b66d600a11077aef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23284,7 +23284,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "565d4bec91e21246e706e7b4b24c179ae686c39d2504a5aaa785e118bf7b9907", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "73e91988d5c268a7e408254184b073a65ade6638fabc0a6db3ffd99670f99c23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23382,7 +23382,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "628f73373c822db870dd34f273edd9348c24c6deba20a85ac1dc7d7827773a31", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "f43c5284a6c222708be443c34b2040e681caf63e569307f5380e7ef36c2191f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23480,7 +23480,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "86a61b7cc55f60dc919b67348d4235a7fbc2cb24d0b57b456b01b974924a1a34", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2b86ba584a3211418d87ad701e225ff82920985871cdb19884a8d82af76a7a72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23578,7 +23578,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e95cb5eab80534dcfcb174e99363f3adf39b42f4f6c35a9349c8887f51f4364d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "ed5c5ac689db14ebdf9eb974d63a3be565e9a38d0471a0ee4a4697a7fb5f787a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23676,7 +23676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "e19cf895b27f6853e329daed77655f6129806105403446282740132fd328bd7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "076837aca361578f2d6aeb6b356fa5934f975904b7ed2af2d02c067e11338bf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23774,7 +23774,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "03c1c8d8a1a0f9c9b2683d3b6856cb503636c89a73321e7e7a1b9bcac9f250b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d1b8213a3c1e721d0508cc1b1ed7ed784a59e83db77b472bfa3bf5473e52bc3b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23872,7 +23872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "af66ed699e7a50150869f942e0c8bcb5362a26703f1b10cf3681233254ee331d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "71338ab8c3f91bf6740a5c8b97107c434a3fa04ece35ee58aa103e92a530981f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23970,7 +23970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9a513cad3d0ad9ffe5b757b891c49fd0833ace56474233ef582d15f2ed3da248", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "7b3f73ec8216843abfdb7a62d90fa13d24cfaf4a177bb0bc0244bc5e625a1f39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24068,7 +24068,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "82ef683b2e07c16adb589afa9ceac36f72d2a48608b3f1cfc6d4f607cc78aaea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "099a7ba8658bcbfe5eb136fb89263b006211ba90461f5d0c39f4f9439259812b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24166,7 +24166,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "220bc7172a923cf2e4151feffcd315a19c9bd134b75f0319f74d3a182a1f739e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "2f30bbe09b271ef773d482fc6de07c847a6fc783a240c546f913052d8579fbc8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24264,7 +24264,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fbc5de7d738ba4a5b63caf1210605c9604e6719a471a51b32e6fb15b06bbc2d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "9a7d939cc2776e02c2a06e423b58b3bda0ca4b99dd7e85c21ed020319b471394", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24362,7 +24362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 195144, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "ce0d0ff3fc5bb6bda3cea4030961a2b48b533613b7514021692be4f1373aef2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 195144, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "462babc6b78c958e7a177ab2f3102770baea67440ef9f07fce1572b3bcdd0395", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24460,7 +24460,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203336, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, 
"d6a7f208e751306ba3b1d3c8474f9683d10e8f453402a3df59bcf6b60ad1bc4c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203336, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "919a2c7fc0340d7d20d956681f2e1718f24a86ad77c10dde81b4aeef1d81b33c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24558,7 +24558,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "52bdb7b4b539744f7fb7fa127fefc43ac47b8346d1078f2a968f324bd25ac531", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "ef8d90ff57f531bd7ddc1ff5e3cb16608d15bbb3ae656cea302f3b11da79c819", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24656,7 +24656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fe516972fed487c4cc2eac3c9dc8e22c717398ea08d9e3c18e139124e5501333", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "358de23cfc4ccbad0a39692515d3263432eb1048554f446ba658a09af4700d0a", "", nullptr, nullptr, nullptr, 0, { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24754,7 +24754,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e5bdb73cc25cf91368aeee422b9a7e11480863f7c9ab2f0b7b406327f31ccb16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "c1d382d7b57a6b6e982ba38c7485f5a709d862aa6cf4abb2340e4f8185ecdf23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24852,7 +24852,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "bf7946d13d4022501906bb20e4ae35764f8b9cc5d7de22c26875e92a6903b874", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "af774241ab78ae73aefdb096bfe4de51305cc13a16f6fe91c716e5832beb41fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24950,7 +24950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "2f36e5638183098015d557565ecdcf422bb6012507184966253a1b7447542e44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "22c1ce5215b03abbeb9af3a6d0765717cc95fc50d8f9fc9c9db90f9189059051", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25048,7 +25048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "4f0688003e9ed42086af5387978c2790e229301cfa779458f93fd4b50928951f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "c26e7a2560479d1d7663ceae50b7879f69f02e6c3cb09a3f2b0adc4757ee9aaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25146,7 +25146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "fff9baa30bed2d64333ba62177e405be7f6aa99ac2d88b15bbb2ed694350c0e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "8728726e946d06d019494560fc9a01a5af58a627b714c26d0a729edcf2cf5dfa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25244,7 +25244,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "e075d782b84a522aed1b3b873fd81112981f30a06f795dbcb6b1f15ebdaba1c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fdf8ef2aea4a0acf6ca79ce369c1ff21e2b143c768be1071e85c5b6572693084", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25342,7 +25342,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "6ac60e6bd81563778ba7092b240ca08b9b89d0f160fe5732529290f6e8620c93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "ac7a2adf51b7c24857016898fe21683a2a6fb2d1cfab8333efed8ad84f4edd32", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25440,7 +25440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "176e9ba2e406c02683cb56f7d68f3867c9ec8cf715b2cde29fbc1316ab896ebd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "9a19822767a5c2a46ed41540580722dcfd4b83fbdded277c2360adbfa7bcb60d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25538,7 +25538,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "eefbdabbe74324aac7655bd4ab1e79aee01dca2579bf96c62e1a262bab0c9926", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "92f6bd5934caae721dc1de8589d09a3b1b1e866922e02e39fb0792410e7d09fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25636,7 +25636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "501fea2fb400076ee07490269170d89a6b50bcf48c26cc2789a9cc25abe2f416", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "44e9f42c39ed21b86110f25b912bbc6543e446ac01454180e28f6f787e875817", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25734,7 +25734,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "053710f86da6620904777060d1641f2c3b0199f653ac0df9c1f22cd3afe127e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "c7beb19f4bfdabe4504b58d361f1dc25c1ffd0c0d594e8b2d452bc971d760bba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25832,7 +25832,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { 
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fa239691a9f0b851a72b3e318c1c30e9f9fc55fc40132ddeaa0a1ea04b2033f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f7b8d06162440f47c56b3a2de6a1698f4582c55f39b183bf26af4fb82600250f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25930,7 +25930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "3161b300a7c475b00e5268e2384ac18d88302ef5812883d9608dba83e6143cf4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "1d7a960876b648c6e24556573b0157734f9386f19db900983f6f5c91fde45e98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26028,7 +26028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "b3a7c277131acfd666be1ca619ccda5296d1b56be0d71d8dc2753a810e6432f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3e6f77505018048f875660ed00436f650b6119aa4160eab0173fb88a88ae2508", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26126,7 +26126,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "7a42d4b64d30ed3c1a5f2c548a68551e05a2f190a68bd750a5ed11cb54f5b533", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d95077f0a738fca97e6653f94d3fc14edf0b3656f677fdccdd18a5d9898e3eff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26224,7 +26224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "8d007ed4875a2aa963265724d5df3afbb38251b49773cc720b1264af27926b89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "42c8f6f2c6ff8fedd5ab0429e20ae002f66a30ac397690a4d07415e2e14df05b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26322,7 +26322,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 175592, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "23be40d1a888e26d2aa4b809c3bd577083d9afae662c68ec90a158f218c6ba72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 175592, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "e98631a4c4573ec4d698c315aa900fd687f71c415a50809f4101acb65ddbca53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26420,7 +26420,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "7342d03360798f1a93d009b82249987714ce0c41a3a990242a646a6dd16360a6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "803ae0047bda4d84fe891707ccc3d714b7844d43913ca022af349082416b64e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26518,7 +26518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "b2e746473c4fd0d405548bec3d6d519cfd8eba8ddf4f284eaa7b5a633dfa7553", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "7379b638e01faaa94ddf9805244e2b4395a45c4348ea64ebfc18b06634e69b3c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26616,7 +26616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "1056f78eb3fcc264d2e4c6ac9636544a746dcfa9a9cfef78d38d039954a58634", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "de49127883237258b1e079d0f94376db6f83fdabaf82984f4fc185e23c38bb43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26714,7 +26714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "6e385ccd2a730a078c52e4445e889a7534194b38122df4d2d878d93f91f71c65", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "5c9a47e537d1622bb617056666b1e92f2c305fe9f42422f25cef74164b6d0573", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26812,7 +26812,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , 
/* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "dcb0a1d7ce7b8ba55c8f83ca96b89f2e01aed0ff096187d7db609ace958cb2ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "5a3f0b7fc546f2f2f6092af631fdc4b3eac64888e05d73f0074a6c122d9524c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26910,7 +26910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "e2a29a020bdaefa3eadb2063f1486657ac0a985852b5ef46a2de7986f6457fd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "f27df8e380a6795a0a2bf02f4c162c0a363c6ff8913aa6b68fa3c9608e71d458", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27008,7 +27008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "b42bb7f33a0f964c96ba7292ceb9b92cb7fd9732e3ae13e4847cd4cca751e334", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "23af9ed14b19921c3e09ea4402ed6a0d6260e80b67903c78400b82c5ac3e6c21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27106,7 +27106,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "0c23fa7d6e3794d03a8c5086f1f550f6d5a192dfc428951ea4277e6b355fc607", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "e90958721dcdb1f2332c06bf551c52065211c4ef513b9b0272a2d094540d2764", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27204,7 +27204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9e98df59fd353b05842cca9ad378fe5f2831a207d7ef37bf81c3d17ec1293105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "aaf22751511376b319c4a114fc1e246c2ce434df03712eb937b521f28fbbd2dc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27302,7 +27302,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1659b5a9df68e42788620f6e3a0e24084c87a47af86c0685130d597177074006", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f0f254417cfccd99ba8b71aa105213f7a2a9363fda391f6f0eaadbc4e54cf361", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27400,7 +27400,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e68e0652a7632ca05352d6da8bb8cd54e0e271ae61fed962a397f3503c8602a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "a4963426baf1893bb98d4ed5bdf470ea21a2711f1a40adcba2754f501b319348", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27498,7 +27498,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1a0ad45c61248040df2c9ff30af4b3a42f617046ba9dd403140efef926bf3dd0", "", nullptr, nullptr, 
nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2996a4b05202cff815ee849be955bffd764dbd1908e009fdb296cb083a34291b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27596,7 +27596,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "54ef1722d0bbdf88f2389447bcf13b13a2107c55e6824ff1c40d78c300ebda86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "14c208a2daa2d1fbbd0442eede8e2a1ae50b5ec23c7613db1ea31a2c67039433", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27694,7 +27694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "14d4c46d1710a75e9337c104cf63b008716a70785a1af0f9678d7cdc61b321e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "0ad3e2bcb353dfcc6b177086ce0ca5b5a0cfed05cad86b7e1281c15cfeddfa72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27792,7 +27792,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "da45991bf3a23e3388fd84812a5c34c9e8658ef48a1fb5165bf35bfd7d49be17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "65bbc10b5793547391b87fbe45cc6b8618c85fbdb4d162ee113c5aaeb24ecba0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27890,7 +27890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "a84414d643013f4e879c57fc91ef0952e23ab692e77f702e956dc5224acc8358", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "2b66fa746c4c8ca5a409d07771d50fb2953df1f56960cebf9727104d0b663e5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27988,7 +27988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "0c3e7b581df76635633831eb0b85ea94e43fc22083e22dd8441efa4ec4a4c411", "", nullptr, nullptr, nullptr, 0, { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "2e3ba6d551c48880fed02a132e24d63a747c5f3fdb83a786c7f14e2b69423f9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28086,7 +28086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "26fa38b5e6bcd1179811cd894192956a7f1dceddfc559a32800e6a0f5b591bda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b3cd92447aba0d9b5857977b5960ea8f30718dd5cee5c28833088d21b54c9000", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28184,7 +28184,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c577f2a97805f72c22b85a43260af6d5eba172198ff988c143c9ef37ec9724fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "78b4d51709397cdca345f7355e623adc9117d9af48c16d84c2824833c9a3b273", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28282,7 +28282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl 
*/ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e22879490e65b449cc6a03ddd7b4db0a3e7bb59a13ae1c5325b302eb2aaecc1a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0cf5d85329227e2a8cdd0f0aaeec6e56495659874dc4dd1ef5c9a6552bad1f84", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28380,7 +28380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "80798aa53d76f060dea150966d16da995fe89ef02866d1bd9f8c01b275999931", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "15f627def4cf932cd5285432d4f7e209c67376c4b4bf7e7f4e37b5cb2086b15c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28478,7 +28478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "df2976401bc9b3ebfe1ca116be0f2a9d351b82f21941aba5a7fb3bda34ab9d88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fa09f5da1cd0e9e8c4836f4fdf568337545034b17b77d588c43a1d8a2019df0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28576,7 +28576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c8de1d41b98a62461cefda26bf66397da7420c8d1c9111709aa53e5645bc9836", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 
512, "6d8f7aa6f531d60108215991f3f552daa011ffea9176b28660a034638e0ac8de", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28674,7 +28674,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "65b03fe9034f50e5a18a4cab39e616d81ad1fec9c0b9d55a12e55bbb4a721a2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "151498bb339507e94a8be7c5e0f3bb74863a70094cbc892ee81c62840dd47afe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28772,7 +28772,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "d00e2a1807b1e30ea3725139daa55278e98958a1b0773d33349286a82642a90b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "454483bd6cbcf6648d73030714f6477d355bba25f11234103d33f017f6f2f693", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28870,7 +28870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "a0a6feeaf1f3510e7119942387cee93ea5ea1d9e202b1bf159c8140cf6a93421", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d45bef9a9bcc6348d0080b9fee3e61577dc160ba96b7d4fcc82cb4ef4246650d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28968,7 +28968,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "a12b8b5f736f2dd3d77b524dcf72e257567a4fb0538caa2ed0ae6c9922b64b0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "41db746a9e6df40dabacf8ad20207ae871ab8782a4ed6644fcb0d159bb3a6e52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29066,7 +29066,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c2a79eeb8a29a7a71ec4743a4f70d793d68b029d9a9f6d7b7dbea2f6a003423e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c1ef21fcc73c176018a6cbc3de173c2aaaf65bc9a10846597312efed14dfbc68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29164,7 +29164,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl 
*/ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2e4dd79a7cea90b3951092522a8305037c3ba8a9bb2841b0771069bfc3836431", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "237368117a87c413fc3a2a5b377175309e00d2375343da40f9604036ae71789a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29262,7 +29262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 209576, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "1554c4f71009ee6d6e50819f3ca40e7f7293a7154371268c2130cbb1b189fcab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 209576, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9f32fbabe4699db11b43b360193f89cd74ddb3b39baa6960b3aff456bb028d50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29360,7 +29360,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 213672, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "b549c361727fd31aff5812467a285d8e85bb8d9a6146115c5b0aa05f3ab0b012", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 213672, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "4ea22fe0c4ab3c999bab68eae1f992d1914780b9905a1552382d63be7a734c30", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29458,7 +29458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 217768, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "7b88136a008beb184cbf393c8aaabb35bd1362f29ed75f8f1a96c775072dccda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 217768, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d5e60c6438e4297c0bbf2916c9af69cac35a3063bc99b3ce5c076596d7517b6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29556,7 +29556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "842dee6e49053bdc14f7545d97d0f0179ea35a1ebfce61a045dd16e4361c33da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "f7cb4bc4d6de203ad0cdbe3cc3363584f69d29f6e2439f22850a7afc73b48ff0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29654,7 +29654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "afbc7e1aa7e0c57b365d421073117d31560182296af081399ed2cd92e1df6805", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "aac276d421a4bd70f1d4beda2c652abbe8a3731b1cfe4e2f5337902ce478f6c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29752,7 +29752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "33f7eb6e1954ed9fae50ca8b45e4d718c035b94474b5cd014c871b83d0a9d8c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "91908c09d46fddec90ecce5f217ffd3c34328d383a4f8c0dee9c8c23f397a724", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29850,7 +29850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "4b013265e6d23c6d4dcd353a7857c504203fd190481a2e98e1af8ad3da143a08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "94c435f10b0b432236abcfbc4d21e8e073d4c1094c9bbcfcf6270a807d30270b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29948,7 +29948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "23b1698bb6d8c2c7994fc562b0722d768c61402fe2577adf75a17838fce17c94", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "71976c7652b55db95f9bec20fb6dfca5d0c5e4683e56763f064334ef82928c14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40826,7 +40826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "a9ff314f96f5cb8caff8698429e3dc3a0deddebcb93e0b9ece7cbf001f7f0bb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "0d14029520ef6bab556cc117f8493822456ee9a95c222741358833db055b1075", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -40924,7 +40924,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "f746a52f00b35727e770c35be502482772823289f0ac6c67e4293f241350345a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "cf33593450a1245c8f2efd0ef81638c8ab18e3c9e4e2c23ea212b057aed7b400", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41022,7 +41022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "970fcc5d3e1bd23db480daca85881fafd226e0bd63ee1b1132c83efbd06c3f54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "814e26e7f81d793b04050f8f8b9c0021cb074c5252919a747c2c3e273fed3064", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41120,7 +41120,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "e7305350ddd3fbb01ca5fa32f5cb712e0c9d825dd0d80b9b4c4811d64adadfb3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "d5828fad486e2d3b1e60c6a06ed97f811ab3e395be41ec0031452bda8d37b641", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41218,7 +41218,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "c24d4af88b38dc299b0a1072e61966324a43a657ab201877d0f1f527424741be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fd67b5020bbfbc73b4b2363449bc970195fb15f252b6faf491a1acd7e306d9f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41316,7 +41316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1ada89c9f3bd54dbddd322d9aa644b2e584849614938cbb871cac8b1eb755c85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "37e257aa6d0daa679849bb6625a00fdacdf21402061fadc40a27c70bf8c2f546", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41414,7 +41414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "3938f7e9af10a1b6c1ccbc8ff3c5b44cba790f9c537ff5e609690f5def6ebdac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bc65f27199b53dec7b79fa1dce32e1b705a694317a12980a67c1e512538b5b87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41512,7 
+41512,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "4f9083896de30f1938a94fcf041162e615d5a801093dc6bfbd4a9b00b4dee393", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f67da1df19177fbc0f88eda5dbaac7cb77cb21b881bf8128218fadbe31b9adfd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41610,7 +41610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "380a294dabd3e8e9d18de0585bb9fa2714d9aa33709c35618a7001dcc11324a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "0bad83e01399d39a6515e6f607a149598f99907eb79c24b31cea4ee089105c93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41708,7 +41708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, 
"0f67ced61034c6a3da4dc64c912076cd3e8f2e98de3902ae934d87a2cd275b17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ef7cd280b46d5197e5ca8ec8aa70ce67078cada80c27a8f503b321c8536002ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41806,7 +41806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d65cfaa78cd5f7e8c3f367a011db1c1eb8a8b20b1d2adbe58e596d9f08c42d15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5f44845ea9cd530a8a2d1ea36b1eb7c0aedd88a029f526284b56f9342df82105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41904,7 +41904,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "23858af9b3c42092cb6029b0d7d979c7c70aaea3bba7b350dbd033486d6415aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fb3d9d3f19f8b6518f7f75c6e7d49328353da7e39b40fabd6382e1a657cb049d", "", nullptr, 
nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42002,7 +42002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222872, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "ce3574293e0088cc7bd7af9b7049e5ef719e00c81368dbf994c65f9b779714c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222872, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "a4d0481422baf757c8b7b8d5bae47b7dd674f2a6b734e6f6c888db2c003f4322", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42100,7 +42100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} 
, /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222632, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "0890c16a871917585c223bd711b78efdc668d48ef3b5e6660135e51518453f79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222632, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "c3169a113c9413d3740b4947023cbc63718d7b7dc84102da10a729a09367a120", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42198,7 +42198,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "e5d62af8ca7104af51a8655eb61142db7ded855d2c4648ff605e64fb21adfb5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "65f55da14144a93961169c6235a79587bc49408beef80a0988cf83dc70c252a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42296,7 +42296,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3094ade13b8fe50e9ee687548554b0e369a9eeddacbb2db02dd08fc88f734869", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "6e9af2a4f54abebd598cfd4c7d4a9dbde91c0475d1494e133b1a8530983785b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42394,7 +42394,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "7a61b6ec7b5d90d84ae253156bcbc718f96f40875525f6eeee6e62c28cdb973d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 
200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "66bc1b722c0091d47c70bc10cc006aaa5abc187e0570e3e25ef87cc80ca00422", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42492,7 +42492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "8a5a70b1623b8922c08f3bac76ce49bbca03e8af0d2585b0d98f304dbe54953b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "a9f2e0595820bf21449197d30fd3a913daf7220b6ac36c41136168fd004b1ff9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42590,7 +42590,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "36ac28efdf5d59cecdb271fb18d1a5e12ae57a3cd9a9c02ce95f906259e63233", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "51435f11485ebe225ade4df64ee2e2352db68365311eda6be876083dd8db1a54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42688,7 +42688,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "383de0e2df6d4e5549af4682f500f8f466f401307c31b94462cf52d6bcf12633", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ef09c1f727ce89ed028aa14532cdd4452987e108c456f5d18deadf4fc59f8327", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42786,7 +42786,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f8d9e1e5ff583c916f640c4fda4349c4fa1bf700a4774b32fd6cf0ad7c56d298", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2a4f1a3ddc06692ce1a3fda3ca22971e3e3c4c2b00138aa311819f0b10889e93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42884,7 +42884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "b1b7d78dc332d285205dad11d5314d8b415727217ef6937fa38c02b26324c546", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 
200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "5daf0c0eeb5b24f564fc130d9bb1417919fdea27599c760285b3198e2580ad05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42982,7 +42982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "18e365fdb1bf8db5a26547c6e05a5f97c29c0db90432c90ef81f15990f76e8ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "ba1641ed203b98fb601053a252c735b67f51faea7d3ff35aee5382987e8aa40d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* 
mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43080,7 +43080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "17a38ee5b53647f8fd8edea1f3351d0477e0880525cfb2e741a0457b8c94eaea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "bc32aa6812865276fb77da21559c8e40d5f547e009963eb1af66358f6a254263", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43178,7 +43178,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "f2a833c78a9dd42859bc2341b3f72fb60dd72ff3d131f953acae669be023990f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "89c1f73e5df57d257672da8acf59c3db0c2ee1c6d5eff541bcc500d62fb6b7e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43276,7 +43276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "667c54081cf5d7a23390c4a465afb6d1f87b68bca2a4092b077fe1b7189420c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "b3fb2526d8a9a3cf7481c177a5071fa5aa2efc4fe57d70bda2a5096143311524", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43374,7 +43374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9c5ac6c33f71d829aea6f62a8c6778e68b5f574e37cf1dc819c249b9737d4b71", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "97f71d98c5726366cb60c1d57b0505aa538e5a344e5e17583f21dba7441dc76e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43472,7 +43472,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "36d052c0c6c8c132c8d6d9e5c2ce1724273b85337929cfc63b83d724e66581d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a0f147bf4935918785d0bef4af0bce5dee3d3cbd31cc9801ded71f8cb56f25bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43570,7 +43570,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7cccd987cfb677be68c53e2dd1ac0928ac6357227a139c87e69ae218f8865643", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f39fdce01af9976568f09221622183d24c41afcc249c7c99b6cf07eac0a200e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43668,7 +43668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "89720ab4bc9efb3a2b718456d354e5a09ce50545d4ebda9e255218f15891f657", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a5234473615ac97ba11fb43421e7947a49c61f2c24afe9c2d7bb68f3218955d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43766,7 
+43766,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f7c4bbc3874185536e7198304ce5a2a4346566a67b8431e85076b0c34738a2e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5f959dc0901f4542c9f7aed071ab378269264ec8fbf0de7fc319e3df1f78baf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43864,7 +43864,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a582dcb8db25d28e4e24abbfd511268a214f5d7646df27d43e0aa8b3cbf32e6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "88a96f873a91cc18a01cdd9d8a52bd9850927a75e19a0d23ce90ea6ede9331a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43962,7 +43962,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ccb0f63ce87660352a30aa117baa05858f5004cde9f80fea0dc88d1eef14966f", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "180f9a53a969754a31505d19fd5bf1af854737bfe97d02198a399d94ca60b395", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44060,7 +44060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7a122c452137b4b2c8c51a4b2ef98019ac04f031034eae91bb6c5a1cc7f5d4dc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "662bbc317aa520bdcc8935a7aa0a88780fed5f3aded557dca5d1edb10f34d542", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44158,7 +44158,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6e404c9201677c773f481473b57a5f6301a75949cb5915718c16c25df52e7670", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bf49e0d4b5ab6d8459cf9bf60002324269464b655ef5603d40de3289427285c8", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44256,7 +44256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "14b27c4073212d062fdba634144b39231fd407e96ab0ca8e38cd15c1aed7365e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "af5b793609d158a4fb4f6f52958f7952ed663ae94f78eeb12bafb16c3f6f0ee2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44354,7 +44354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f05ad8f69c19ce94cee2f8dcb05d854f942fd42165e6f73a9f24014c165beb13", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c1971c8b1a4bb8f53a8e79a5e8db23472ad194d5d6b286846866a792338f9d4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44452,7 +44452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "82cda53aa11558cb7bb939d9b474ded27bf4f212ecbdc7e497ba2ab6f6267224", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "726f43adbba0740a548a63eb1192dc1a243abd3e675e9df629f16e5f10aefeed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44550,7 +44550,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b246f53763b5b501d330ad8737e3ed4ce2206ff88232d5f8924240c58f045272", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6261bc8b5b3205c9920df28c20ff97da926137d4c9c1bf54bbebdec514a94601", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44648,7 +44648,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9ec40431f477bd04e0ac1ab649c145d858e60a59e48819b48958056a6d0521cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d1ab2a787ac06a2b5e70ebab08d3b9fe585a9eb82b8d562cb099e817e4ddb727", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44746,7 +44746,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d46e557151d9486632ae9cef179f3a591dcfa63068fac12957d0569462131bd7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "460d887eb17187fa7e2ec04b7166f46580f55e3e0a6feaaaa3f30f8f39fce763", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK 
*/ -1 , /* mClusterDimX */ 1 @@ -44844,7 +44844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5b674b9a08a0d0f029cc428ddbcfb4211524b038c19ad7738fd231e321729fc7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "58f35a6dc20f4bf4aa9aac32ed3080a54cff087fbe01277d3b25e413e7a82f16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 5b3c8c5d0c..09ee7ca789 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeb5cd123bafc5a63efa28435422b132354c7ff3d3010fbbcada752a54b7eb00 -size 1157648 +oid sha256:25bd7f21415ede7acd0d96423d6ffaf55945ee0d8231996f82d821970cd49128 +size 1160804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 291802adcb..9cc4b2e281 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:057f760b9976712414d4f2d48114d2855bbc114f9e275c9ce02e4e79344bcb6a -size 1134858 +oid sha256:6544003fbf6ebd0322ef6fff4b888607a8b4af20d5d79e2abc29273c271f3faa +size 1138064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 6d65529f1e..2691c14bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af6feb5e3e7dcdbf84b175e3f95e3c71679ec81fbee1c08add0437282a7ba567 -size 1172156 +oid sha256:27ab3fd0dde9b0b693d9d77603d1b5639651fefa71f722f94dd96faf4efb93be +size 1175362 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9fd1946366..f1341a15c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a01cd18f1c60dd8cbdcb76351f3d70a6bbde4596d1740df688664503aa963a2c -size 1150204 +oid sha256:603bb29603b9870df3f9e738271289e90691b585d09969e28c9fac8733cee25a +size 1153362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index eb4dd04730..3e504f35c3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48ece823140284fa80a98866bc9972a75a192c0a5e18dd49a408391348f220b8 +oid sha256:a0cc765c518f5a3ebc0ca533d702a6637d2f307ee8d805b73130fe6f5596d07f size 632096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 1a256e1c27..7a85482d79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19dd3971002732fcfa0dac8f2ee9921feeb7493ec7f3612c66f6e2a2b302b0ed +oid sha256:b8af522d4aa4ce9353782d9f0ae15f4bb0751a03de17138cf284a95acd2b61ee size 629878 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 65d3745748..0d1a105226 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70c41810e1221972acdc6cacc8b58fc8a82cb50b3f004fee7dc36879b303ba6a -size 513977 +oid sha256:c8b755c5f0145d93a223887fcd961eb2d3ec6907c79c6f3ea3568386668f50d8 +size 514765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5c978f03db..0ef7e7d67f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0df6169b607713ea44bc5b19d7c6c6a2230bd9b99c28dd444c48a8080ada3853 -size 511759 +oid sha256:dedc17afe8442c3dbf7ea8f883a19d0cdcf85a92161991b53f0f66f63f988814 +size 512597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 495c2e3494..a568b8e00c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0451e00277a31f78c563b3e4f613c3b0174dc0b1b6657e9a6ab8df2758d33c59 -size 652968 +oid sha256:f29a3484448fdb88d692b59c3aefb8763c07aaac980e4426f338df973171470f 
+size 653016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index cc80d141a5..3288e77077 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a06189c11e07041b3bfbcf051544572e1213bfc343652b15e26fa611020cd7e +oid sha256:eaaba34ed940f47006c8bb42b313904de0764a1df2a526061fa31ec2cd5ae4f3 size 650798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index bc547a7058..1c9b3f0ba4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ce4ac76e8794198d659b9e3fbf509d87b914388c3c12f149fb2c657d254288e -size 534651 +oid sha256:4d7d5ffaaf3b5c496a7135f0088951dc97592b2392e29f3383e131f0cf8715e1 +size 535441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index aa3bfb3cee..f573eb1b89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19544782acce66ccf44769b9a8091f09f089a0a9b7e7d8f45e6599718e78b30c -size 532483 +oid sha256:0f11e5bff1bd49edc3507932e603ab23193c6034c010b9eca61c6226683ee1ab +size 533271 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 770a3c5350..9abfdff4ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcb7bba2cbdac4bc67232778793ebedf2e9f33c98099764e9ef40ab0e80109d5 +oid sha256:c2815e45c528adbf17b3619de2bcae24f7e534e6503d546d4fa8b61926a4f148 size 684256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index da970ce257..c6e1a520d1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fc1d6546022c8e83886df83a40aa2238be7a64923a86f66ea6a990d778dcf77 +oid sha256:271400e2a6e1161bde9257fa6f12006debbbeb4f4ee0c996f7a1b6ab54dc9146 size 685440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index fe41a7da38..872308e2f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18fe044a3a922ade9ee80aff4226628792c5311ff3ba9b09d27be16c1a89ed7f +oid sha256:346e630da02da9113f2298f6a67a3a9267764f4112e50a721f89dbf94fea4630 size 
618824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6ea74cd324..75ba1b1899 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4b72a831ecd127e42109b892ad84e603e9484ba1f38758c85b62fbd875a9513 +oid sha256:17f8e1c4853c1c0f3ebee3713297116c0bd243a2538707b7f90bb5899186c820 size 615865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 51034a36bd..d7b2b00ba9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dea317b40bcee98d130156b055943d531bac696398fa9b5880ce404fa3783cc7 -size 495033 +oid sha256:ce679c7fb7955f75f02bff0dc3b3bc8ec643c74d3b311b085d9cd72e68723930 +size 495821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2305fe45aa..27d3d93a64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a804031198a6d746223d2bbb462cdb47266aa6b9ef236303c2324225ded77ed -size 493505 +oid sha256:94d886c4b24536bec8dfb07b5371451fb6ebdc14068ec915e7992325ec8c61e5 +size 494295 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 4aeca3e30d..44da920ed1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e74156590dc36067b3e2659aecec6a1ba843b7089ba171623593c7adc53bbeb7 +oid sha256:2cf52cece67ff1bf0eecbc62ac3c758a69eb2f0d6e42f098fd6c2db460b0f8b8 size 643496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e487d0e69e..998cde6e2f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90b5086e9ffdfeca3cf0fe412bfb7bda7e693eb64f45f71e9e471cd9f3450714 +oid sha256:838e6db0aa2b993688eab3d3bd2d32266805e50d4b04f4c885e1c7f0a4082a2e size 640538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 1c4872b6d1..0c6626be5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80c0d3e20d64b57231235d989eaed82e87f948f25f10f6047927179e155fbe88 +oid sha256:6e374e9f129202afc1b6a81399f6b5d43aebebe8307302d762f64584c7f6e105 size 520541 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index ee64488aca..5abae6e3a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e932a474574c4e684769fa66c87bf300dc91814004627358e6a3fa8801622797 -size 518373 +oid sha256:1233fc079977f2dc1b3af3f9f8a1b54fd2d4abd982bb57d2c1335fb085616a31 +size 519163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 400f193ce7..b7520cf416 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:287b53f4a811c355f389de46cf258f4f4fd14a6191e7a503085ce5e3bdae1c3b -size 812848 +oid sha256:4cee02c5e09a3905eb75dc530c8ca775e85346f4277bd5d4d169b52206844e57 +size 816004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index e2ec181448..d0957f916a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:99912f9700ab6216ed809865c1ba9b78fe02f7102c9fd4a26eb1a5881cee2562 -size 788972 +oid sha256:2525e3cda0dae80f08c50a57f87f3927eb6e4792cf59eb0487bccaa18d78563c +size 792130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index f426f3b23c..f602ac986d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bca75ca5ec82d635bd7a7fac2ebbd86aeaac6c64c9214334cbb4bc75d0dd777 -size 681380 +oid sha256:ccdb113e7f5cf40ebf310f6629ba1f7662180e9d6dcac5be99310ef83526a05f +size 682958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d21b7b0843..2af72080a5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb3fb6bfdc57aa3a53fa071e5a9a321121b678fef8b738458a3cb93ff5099b12 -size 676202 +oid sha256:252cfebb26560acb60ce716366ae79c3839cbe09b6d16da291645f4cdb0e51f1 +size 676990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 97ace0712e..eaba052d80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a788f4ff73986564635560926a33196feb0fd57df75b4e0d23f7eb7d7397c92a -size 566961 +oid sha256:a2b59b5175c3af0e59fc9fecf06f81c1778eb649c736c97bd57a2018a47120e4 +size 567749 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 74d66428d7..82c37db903 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8561b171dd8569a6d3384331c7521e97776c973704fe37c095ec763719a25819 -size 560993 +oid sha256:308da1fac2127b4e8dbac11a355c18ee43cc59b7876a1333db158724b8453aac +size 561783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 20b67408bf..7231399f4b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f87806fd2887e4bdf3a219a78f30485f000a56f0cfd807d284ad00226b44df2 -size 700574 +oid sha256:def868a7bc94ca91c2d4fb8f4f89b42adef703ebd7a5e0face1fecee9f6f76f2 +size 702152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6fb034a1ab..c6b5fb2d89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d64c28ae9a171f8694ac8051523388f7503f905682ef761a81638703f853866 -size 695396 +oid 
sha256:8e6c0cc40976a3a93e0a5de8dbdd5bf97a7e60f1a4bb02b3f810021cd440b88b +size 696186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 38ae7e7dde..043778a527 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c791b9c7d1b87298d53eb0c9aa12fbd7cc36a0276adbfc294ff61cf852ec2ccc -size 587635 +oid sha256:2e16172a6b1021255baa1a3edc59615847c50d59054bc4acd1f43ea969526f11 +size 588473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 859804e0d5..476440f5bb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d6f7641cbe4a4a168cdea8cb003a52bd08097e45faa2a7498678c15d2ab804c -size 581667 +oid sha256:3d5137a5d4cfa02db667353309afadae951de2b6c212323776001a8534e40a12 +size 582457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index cc7805ec81..6f7d8a8322 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a57334b74fa5cae3d2c27b352f5a206bfb691bd5fd162f78e5f652660cbd8301 -size 732702 +oid 
sha256:b36667d2037857448b352e09506093cd630654b9f04500dc92af388e5e01d3a9 +size 733492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 709c21ea30..d864cb5782 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15c7aa8ecbc464f2dbd2f7f199b277402d1566aa1d51cfd084d9780199d6fa0d -size 650102 +oid sha256:3d06a961fdfcece5e9d9d4317f3f37678a3966e5a073e4848efe95cb6d501450 +size 650940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index baf778a2f9..37ae55d12f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:806579ecdd2f26c792e63fb7d9371d0b71aa2a8f7cb902376c5f3b3b2d4edb17 -size 644924 +oid sha256:8ed70ca51ab999b5ee4e63197c2d6a369bb5068420b9648f6466e88798191cbf +size 645762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 3f643b4b3f..ee9cb0bc11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:102a311c689a2e9cb266bf7e2f1235f72d3d26c96db5a7f36da55000f1684882 -size 520489 +oid sha256:afd2154fde1db9e0e32fd1f5ac71d7d562be69590ecfa6cdb7480ee03560fad2 +size 521277 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index df097615c2..3eebd5be9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97ebf0eba0c722f2e2596559a840f1fd49d304de7404ee89fbae9d4c730f5b96 -size 514521 +oid sha256:70340573c7fecae49d980be60094fc57e02beb312fef43e60c27d42b346ec3b2 +size 515311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index ae0b0967ff..62754e637a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df0a704874ab21d43aec21c67f5736b94ad2c5eced2b1d880157112c87ec9d9f -size 673392 +oid sha256:7b2f57faee1823df868b3517e75e6d0a4ea0f43cbfa99feae43d0980ae82e3f2 +size 674180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e9189fceff..69479ab409 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32de64004e364baed2a44fdb101cefd9cdaaedc1b811b299dba1c40edfd7ac09 -size 668214 +oid 
sha256:1a4bcaf79c9386bcac8859748ab2edf688cdc4b3c4f9cc2db12b3b0484b961a5 +size 669002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index e26ed7f99e..b26a09ec09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f031327908768921070236eedec46f4d0e32c575afd330b896c73d5b04d1196 -size 545109 +oid sha256:42ae8c62785dd082aadc50049de9d06d2799d945fb0e0aa6c9f1fd74d81a299c +size 545949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 053c606f93..95934d4000 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80234fb9fb33f068ed41a5d86f27b8d32b3399aa928a76e9c3e26c551e1dcd6d -size 539143 +oid sha256:ae5241c2c9e1a9015f065ff171ca86e5f25db32aad674cca0de5566bb994ed1e +size 539981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 561eb09f02..d1adc9db19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1ecd7f2bb30a8ed29be8772bbedbbf782df4c483fea1e4bccfa9d92ec5eb440 -size 892428 +oid sha256:aea2ed55b50ca3f6911fd5473afedd3cb18eaebf28a757c8d1bcb38cb7db39c1 +size 
894006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b7de9f6ab0..8c3067745e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cab2ccc80dde1543d502e974107e021b3deefe9c2e8664ea51836b060ca430bc -size 880490 +oid sha256:e06b6194e59762423d96a26335e1ea362794d2ca777058c0e8626c9a5637b420 +size 882070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index cf7a98fc33..b922591289 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9463b2e859990647e3592209cda55319ee51c384641dc30554b94c82bf03a287 -size 913546 +oid sha256:58ac26ea07a67900f2fc38dc3e720d29e714e84293d08d067b26926cf795d9ea +size 915124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 331b97d040..5908fecbaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:033851d3c22e634d3890b007d03ea9fa99b62936ac1f6381470cddddd43cdd6c -size 902398 +oid 
sha256:ad9cff101f5287206d60e4553c0ab3e312809a932b9dcca5e6c772fbfdc83776 +size 903188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index b1c7e82292..051130575f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0862ee7fde9f1ffd8d2532069b7eaea5968867691b64aef8e00926e6d47ca5c -size 718528 +oid sha256:44c8f6ff26a0ba6f271aad7880415c1551182b5a8742aa60d1a1bb66f6084792 +size 719712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5fd85f6b16..abb628d8b4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a302565cace826cfa6b8905b0a9e0b471e7da1f10cfd639747b02e0c0fd4417 -size 707380 +oid sha256:de9d34222e002049954ea75bcbcd8b926710af7013a057ae4a3362bcee41e349 +size 708958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 727f8ed81c..9a1d12d4af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b339e465a0a4aaa39d87e646eb9fe61851f240cc2c5199ac580b078ec75a9a5c -size 741964 +oid 
sha256:9ea990e235ed8263b96de50411ac19734e72477af8e1282d45a5f7f086011390 +size 743592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index df628616f8..0c0400cd38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4704d962e50b64f08cb1e99a59aef24a034497447aa4d0f8d9817c2ae011d809 -size 730028 +oid sha256:89332b6ce9aff5e0a166a1ea43c11605a6e77671bed4282f0cc66478cf2613a0 +size 731656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 5d6fce64ae..16626fd116 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d37e918f83814bbef880284974e9f3861ffdaf4f6dbef7b557da1355bcba910 +oid sha256:da4cecb948f63c91cdcb9a59dd352939c329ec8d8c8efb7cc67ba7708090c809 size 595533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c53d6094c2..8aa6cf471b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:149b1789edbdb368aa09fa29e903e096c42fa295eaceff845e20c248f026189a -size 594795 +oid sha256:28581741a00bffcf381177c1a2c2c155ddda542ce478c213a0214fe89424e6f1 +size 594843 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index e67f0ba238..9cedf810ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5d6b2f8ca104633ef1d2857aa113540503412fec899e36688ec4f3374b5a6b1 -size 490587 +oid sha256:22325bb077d2386de27c311df1613712c97cd1d108de0d4332c0e53ba6a2d20c +size 491375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f205a81c75..c1df0dea8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaf595c85618cd4259eccc0448242d9b3e14a42116caa1a2a7589a15e3561602 +oid sha256:044d83fe49aeda8198797ebff786d2db5a7f589077be04b00fa94edfb3c2b2b0 size 489897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 276a85a178..da26119a3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f70b80a052db06fe822bf38a0f1f31a17ee5448ba5a22639c6e9c230b085d17 +oid sha256:cc712e198774fb458288582f4b9601d75e2d330d8121c110c0d1170143b1249e size 617293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3fc83049f5..5a08c46644 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7cfd5027e7ccd49ab475b15fc3d450c6d6dd91b9bc9de465e1623d1d280afe8 +oid sha256:0ae9eb46692bf1406cc7734c23c28a2f3f54dcebbcaaaa3ffc7d9cc70dcb0421 size 616603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 7e8ce90171..27eb1ffb2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b91cd4c4f0c362551f1cd8e1bb19f56844eb9d0968da2cb765b99cf7dcf5ca3 +oid sha256:94d4ddb471e43132431029483377f460644d00a2cbddc18b8c1e7eaba659f316 size 510817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b87e1741c9..617868489c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5a5ad2e5f2b602874700eacbb39b4ed7ee00ddd7ec33271c565030e812ab88d +oid sha256:0761cffa7890a6388711d83e19241af9124699dc5e21d9d420dd80c8675f4e78 size 509339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index 201bcc6155..7e423e331c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21ab931b9929b7bbd8bdade974ce855a2c67b5c7ae5c11a56fdf27e4634b79aa +oid sha256:2be7b1704c74e61665d6e0cf4ed9507723e2c4898aee3ef3f7e2e749f2136265 size 420643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 821070fb37..36b0223aed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3b48c8efb4dff8fd142020c4ee0b12e7e65f8fa0b73956cced20391be8dfa312 -size 565489 +oid sha256:35287683ef5bfb6fc84d3dae3048f13cf44975b34a033bcb0b130140dfde4cec +size 566277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 98586759ed..d822e3eee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4786716284d95f263efe824653d8624ef4f64f15cb8b4fc4bb536dce41959aa7 +oid sha256:b28cdbd3965baca90314e8a300b9fca7f3d3beebac474f7adf8c434be8c5fbef size 564751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 0ff099da27..6f18b12c99 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53dab9bcf2d5a14c5846073602fa8663aa2f23d60f072149a63ab997f3347105 +oid sha256:55566f111ec4778526f34ae6929c8a0cda03f276f217f6b3a4c2736dcbf87a8c size 444015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bdace06334..46f71a6c72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f60720d79c99cc1920a690736a0b9529508980b74e69798c90ffe65e149f785 +oid sha256:dde75a7610513f1c86e50ad563fa2a6ec0d5eb9e09344944cd76a5c86b556e96 size 443277 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 65d6b0e89b..cbf2f009ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48c9487274aa28d1e9edb4ab5799a1fc64b5b082f4c48c4eaa6bd34f84a3789e +oid sha256:8671af1a3d15053f03a09c57267b10458852096d0a757b23206ee96e35fb72e6 size 665504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 424c9352d8..35611c2e27 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ceba7bf33f2027b23dd0710eb18f98ab93751d2741be391369f100d71035f95 +oid sha256:12df43f2efb6f7637c32a79d9ec95c1b9a798108e8bb7b356b651a6c4590cf8b size 666836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 0ea7e07792..0cd52747f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c70ec73a02e18c71b056ee64a6e52d31f1e8041bce805e3f7ca2eea73dd1d42c +oid 
sha256:9bf6e56b6ab34e4f977384b4ede744382638a5cd98692852ef5953953709095e size 668020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index ba50eac776..0800a8bac1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66f3508e20ef139fdf3c081d690f649b0bc182171580cf64fb50bcbc51952f56 +oid sha256:baad4752f98a79534cb3d5d33e3af0ee3b021634a243f701f85bcd8fb9e6faf4 size 444573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 2eb86e65a8..a75a2949ba 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3b48526d715bbab598d25451a628ccc2c59259d34502d1e323fd07c87050dce +oid sha256:f7c8635b4472f935d1f13938172f28a2932db32acdc0e1cc3822d156436c5e1d size 589469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3150df3474..edbdb93b7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d15974f07dc2a315ff7fb5ac9047424722b214fccde95ee497b9d4597864ee93 +oid sha256:a06b82b027ed378afee531d3a4266283d1671049756930070155f2e43024e8f1 size 587941 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 2a4e461626..a9cb598833 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2499dde80ed240d27e1e45aa71026f3669f56f82097c9903caf4196e90b768ff -size 469427 +oid sha256:34226d441881c0128dbf70ef6b9450597f3f0c4a092907dbe147cd0f27772a0e +size 470215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index fe1e8b7a7d..304ec3ef97 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:129a3d975a15cabcb0bdca77421401d9fbd08041d61babc5fb4bace66c5f2900 -size 468737 +oid sha256:a71ad08152c428ae0482040f23775802a11f93f1143c71df226abc31b43685d3 +size 469527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 9fd05211c6..7c0e2fca1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a07054863e7a9c6645b6b7dd8f198ecb5090c89f782c18127a8efd7cc095eca9 -size 813536 +oid 
sha256:69450c09726f036ce91728a897a4881a2c5b549d5c301dbeb043011bd7399ceb +size 816742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 0a99b7a046..395fcaa184 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a872f4d2ef18201942fde6cde0bc37f549f59dbf4133ee93739b5e2a01111d8e -size 833372 +oid sha256:a2878644e5ba47285320ac03d6cdc7e987a5311a2082d871f1506a11e6fd1d37 +size 836578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp 
index a001176870..b40156ec66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:375015ac38ac81fc5c37c138685b9450a852e4ec228cd05c90710423ba8b3e1e -size 817384 +oid sha256:29511469420393222738b28c8c936495a6dac9d14d4eb8def6e382b44029d378 +size 820542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 07c52f7fd3..fb972f5926 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b5c012d75d1e1168155d4fddb7d692e75143e02282b32b0ab9ff6306cf6d01e6 -size 840674 +oid sha256:52caf03103a1d8782c35db9d265d7059461e86409eb7c4ebbff59596b32d17b7 +size 843830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9d2017bf90..a0156ce6e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9fff94435523983b034d9971d344867fa0e4eb30649ed2e217c2334aafc2568 -size 603001 +oid sha256:0bf6b694c7c32a694c209d461b5d4faf4c734921a6bea9647942193f7073a661 +size 603791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index afa39999f6..27df3d5dee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adf96061946e5e55ef16ebae468c3eee08804fbf11efa9985c69285edb4128b0 +oid sha256:216f7e20b54e5cb83156c37f7ba1f2dcbb236ed0c92b559bcedaba36c173bd56 size 481923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 77a91ac5fe..28ed184d57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab32a3475a32e89f6d189ad0977690dc59a56f0aede20ab3a53f5a86de92dbfa +oid sha256:94dabef72706a068ccbf08c2c7cbf97f29c3a1d49a6bea56e11365ee05136528 size 644690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e235095f57..bf44c63d51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:195f83814328481378c62b5ce75043af5441793c5a7c54a7bbf43155f0bd730f +oid sha256:6750789bf4caf4c3f40d66d3dcd32df96c076403320279b6fe53a18dce9fdf1f size 534563 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 06b50bf708..a68d834546 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a82f5de5113000e53a202c8f772b2895991219327cfc7ac2d511589fbf565d7 -size 627180 +oid sha256:117cbe5255e3dd3a73edb74422e3a403f144a159705f5fdade0342882b01d835 +size 627970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 08c66bf0db..66230bd5f4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18bcccfab2980910105c6b5225f67135b03bb029151b153bf10b9140b647032e +oid sha256:861bdd1d12c1bcb3062f76d607e6ebe8cca56328e74adbcb911cbceede17c3a0 size 505805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 667c29ac20..e0d94df9f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aadc338220168007d4d9bb2d373402dc59b0cf06a9c01b3a502de7804e1f050 +oid 
sha256:29bed1ab058039824ab7a3100fb0ca03a23fea5e54d0f0096fa4d9b1830fdcdf size 666894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 14c64df616..f71fd69288 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc8782e8969f11793508f1bdf642e52268adc9fa4fedd04f625e7ee8c85f8f21 -size 557703 +oid sha256:3f94b24ee97406b75bfa3a50e797f19524df0c973cb8e552bc2ddce931790773 +size 558493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index e6bd47f8fe..3963a7350b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:961bfe5571d6ec2bdb456b58415ca73111f5cf1d10f4f17755f68c8a002458b8 -size 1084680 +oid sha256:6f57beae6249076affc7d3d4bd432ec07ef88279eb43199b7976d5e7520bdb91 +size 1091044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 284aecc676..1eaf9ab570 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:539778187a7d280db3233852e9512ca7e0c45d16d5b4315d681a1daf892817b2 -size 1116106 +oid sha256:6cd9462ed19090f9068d0773119a6a018ee9cd11442878a11501e7d92b35df2a +size 1122470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5bbace13b6..62237cd3a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:903d4cf50694dbfd301378f76aeb42f15940d9eec99623f86fdfd6ac8e7caa1a -size 641730 +oid sha256:f620824377db3b31dae5afc5d0a1871ad05f6c46d2c072d8498e63d28baf1f82 +size 642518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 
064d9008e1..931b296b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7e778fc5a2740a51e8ce2fa5b6c47f4722b433d02d3e42a8d5dc044368328eb -size 520503 +oid sha256:36bfd0f660e8e8e2e86c579e0f10292cf5f9a22fd30e3fcff09f8f4148d97fbe +size 521291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e7a5d477c7..a68b3cd74d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d9d3cf6bd7f0922b7e68f1547c7d7f74f0ad56debbaf3f713a6cffc2f1fc356 -size 676214 +oid 
sha256:d5fc08e5a360a3b0b55bfac3bed479231ba8c35938ffff3ed4ef4d0398c05511 +size 677002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9dfe35850f..5490fe3c70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce860da62907fe5dc82d1110379fecefc77d347a326d6fa4b9882c746e3fd0a7 -size 573289 +oid sha256:fece909856fd73e56585ed76479f7e3b892c6f789236765232005324e559df06 +size 574079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f4dfece7fe..6e70153b24 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e1a9cd96c370264ecabcfbe64c0c9574f66f4241e7387f849bc18d14c8be89b -size 665710 +oid sha256:16a4020be6c89f7c678a40231ae93610e80a533755938701f8dd5413d76b29ed +size 666498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 85513c380c..f11250f81d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d83dd7eacb0c7405144b5bf2e3036f0ac7805c09c6525e8132aec3974198d75 -size 544383 +oid 
sha256:af874ea43c5e0f19f15652bb8ad3750be3c6ed15e7fb418138ea3a4f66cd0307 +size 545173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f49ade37cc..e2b0fa16f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d82f2afee912681a9e5c571b4cbda2de40be9a9bff96011f88b268120ca418d -size 702266 +oid sha256:912471c07d65c9a25dc9d013a82f2b1f35270d0517b2c9b5c355439e792ce5bb +size 703054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e3681fbe7e..6734cecb38 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edf14df8e7a85c1221b829fad1e5d276af364f449810e707fa13e099c6b8a60b -size 597171 +oid sha256:c753d4be5b5ad50d64023c181b483e31879cbdd488f31a06e0412b429815801d +size 597959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index 4c078a8a7b..d0cf0e738a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39b8cbb545b6a29c1c04273f53cee081930a4e46c0513e8d5100a66f54a143ea -size 
739138 +oid sha256:01d78d7c9594737ccfa44cdc19de0c53809e1f1f3755cbd2641e379e5a465433 +size 740716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index b725ea3d9e..06b63bd161 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a55db32604e640f55c4e047a05bbfb1bdebf41009a096eec98524e1c26e99dd3 -size 759122 +oid sha256:1565b39b7f41fbcfc66868aa8b6eb86d74688fc214fc2ff94d627ee8b05f9cc3 +size 760750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp 
index 2106b85d5f..4f8c7bacb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28fedf7eaf689fe5eb050f1986e793fce12c5bbde99891ce4ac2010ecec7f403 -size 777962 +oid sha256:9d21d83b5ca0567377c8297e0a7569c131770248fbb77a2606b8e30bb76f2731 +size 779542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index c8c8f6d70f..51e539ea6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:144aa27525eb42f35f8b33b3243f1223520f46407b776c86f96667474aa087b1 -size 800216 +oid sha256:d95bf2f1182b265f190cbc598f40ce1fa1cba121827caf0be580fe51598069a4 +size 801844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 538cd8094f..d1e7247446 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf78367c4241080dd94f4ca59c1e6a755ce46099526103f78e951cb3924e6fe2 +oid sha256:8f9b7e042213745dda950572b8299c80f6c83315a329c5bc1bf8dd9be296fb16 size 581487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bc2383f5e9..529867df7b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12d14c45020d8231c1224d2e2bee29b00f27f39f47f0eecb75179f8cace042e4 +oid sha256:c827200c49e41b655518c673e8b2cb94fea4fa030e3b425bed0141ee6387c056 size 475505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 442482cf8e..8b02e00582 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3344df36d083d2aa56b7c49a30adfd9c0b8dc51fc203cc2a34c6a5a7cc01a90f +oid 
sha256:9799e98f8eff05a5b217f9e8856ca20adcb382105eedd764822f44dda79db491 size 680416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 22e9c00668..e57245de4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3096b60b1bd115ccf334004777e9ec335e710d7bef655dc7d1600bf193213e4a +oid sha256:6e056e4e032560e2c72174a11851c893caaf1bcddcee3110f59326467b8a0c4a size 533241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 1a8f5cf4c3..010e002dd1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a02e783e142042cdc71aa21361bd88ae0a5fd212e3106204efef09ec5c114167 +oid sha256:0cd4103cf783f0022682596e7727c4a64299235c3a40620b9637c71a26cbcc7c size 607735 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 0b195d841a..5e5ab3468c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b78aa42b9e7caa9bc51750d07593ac450a53b9e1854c9db27b93054f5efe5762 -size 499189 +oid sha256:fe19cc48c91ae17f63bda512dd93f3b5d29f6c6b7abd2460e8068950b09860b3 +size 
499977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6a15c5f8d2..2f372686cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bae0be88a78f668b527449bd1f0560b763dd6508bbff34d6305abbd73691e228 -size 702818 +oid sha256:5546a14cb40f9e5664a59fdb429216bdbb5c9b33a46fbeeabed25e9eabb2ff5f +size 703608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index df1cd85db0..aae9d43b3c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d15c840702622b545a0501702d52738c97eb72facf20025c15b7c5b14aa199a +oid sha256:518a1fa23a7b2683f328319cccd852a741e692fb1cb78154ab8f8fe91de113ce size 557171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5d08da899e..c1dbe11b4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31bc0a34760c23659e59a68960f91d4453c9bff2cf5ff108742438a3963ddda2 -size 579365 +oid 
sha256:083a74bb7072ab6217188ae27c2b3bf200e5460713581bfb7192890af29f3f61 +size 580155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 07d3b6d673..1fc5636278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05862dccf2f673ea633d5c24e3581cf93cf6d965efdb031fd1f651eeaeac05ac +oid sha256:ebaa0e489826884ec48646ed5a5a877aa767cf5f70de7655b64613ba4935c22c size 468943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 47d61403f1..e629574fcf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edb164e5075814fc74ea7f9237bbfaced49315f9db24a6518cb6c6d25e6cb58e +oid sha256:32ca8c7ce61fad07f604d44580450daa1d42c7e9f7406dbf96119a2eed9123a2 size 671488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 7b2cc731d8..bf02321b10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7925861a29872465e7e249919e03880e1cf73d51ac4c9014122ce5c2d0968918 +oid 
sha256:641c89de87f7e92435464407ead2f96421cd389aacea5ff46eaf404d4cb2e8bd size 527813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6a4eed8a28..08ff8ea7f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0919d7b2528eefeeb9a1d21c91cab286e3db259b63fe344ee1e2768da0b2c0c1 +oid sha256:f03bc77973a9987a12fe120cbbc71341077264203b640d0a74802a37d3632a7d size 603641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 97269c294f..b349b97727 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14f731d607a8e77d0ed4214a290dad09e28699817710dc9c2d1d14045938cac2 +oid sha256:7e3c5d050bdc41f37d8963211c8277cc7500d3653048ad6a9c6bff25a9e149b7 size 499731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index eefa346522..cf8ee034c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66c9ba6f56d0dfafbdfd4377b0c49939a0a9c44bb8218822bbbaae5535fd9b34 +oid 
sha256:5aaf676bb6579c8a2115df80039d93c62971cedb6ec80720003c2205e2c8e5cf size 698724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 07e94c237d..1b10f9b169 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dc20d2c4effd3414f86f2657b62253d669c94ec044645d9389049c2462f95d3 +oid sha256:0d14e4b959a5cbdf6a735554cb10ed373aa543733ba07ce42d23713863ed8c89 size 558749 From ef4ea955b20cdf846355fa3dc8fac325a6a2f74e Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:20:53 +0800 Subject: [PATCH 148/172] [None] [fix] Fix slrum scripts (#10007) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../slurm/benchmark/disaggr_torch.slurm | 2 ++ .../slurm/benchmark/gen_server_config.py | 6 ++--- .../disaggregated/slurm/benchmark/submit.py | 27 +++++++++++-------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm 
b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index e811290230..4e34e30595 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -16,6 +16,7 @@ while [[ $# -gt 0 ]]; do --benchmark-ratio) benchmark_ratio="$2"; shift 2 ;; --streaming) streaming="$2"; shift 2 ;; --use-nv-sa-benchmark) use_nv_sa_benchmark="$2"; shift 2 ;; + --benchmark-mode) benchmark_mode="$2"; shift 2 ;; # Environment and paths --dataset-file) dataset_file="$2"; shift 2 ;; @@ -59,6 +60,7 @@ echo " multi_round: ${multi_round}" echo " benchmark_ratio: ${benchmark_ratio}" echo " streaming: ${streaming}" echo " use_nv_sa_benchmark: ${use_nv_sa_benchmark}" +echo " benchmark_mode: ${benchmark_mode}" echo echo "Environment Configuration:" echo " dataset_file: ${dataset_file}" diff --git a/examples/disaggregated/slurm/benchmark/gen_server_config.py b/examples/disaggregated/slurm/benchmark/gen_server_config.py index fcab212b42..c613b13836 100644 --- a/examples/disaggregated/slurm/benchmark/gen_server_config.py +++ b/examples/disaggregated/slurm/benchmark/gen_server_config.py @@ -78,10 +78,8 @@ if __name__ == "__main__": 'port': args.server_port, 'backend': 'pytorch', 'context_servers': { - 'num_instances': - 0 if gen_only else args.num_ctx_servers, - 'urls': [] if gen_only else - [f'{host}:{args.worker_port}' for host in ctx_hostnames] + 'num_instances': 0 if gen_only else args.num_ctx_servers, + 'urls': [] if gen_only else ctx_urls }, 'generation_servers': { 'num_instances': args.num_gen_servers, diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 881b08664a..9aec0b689d 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -26,10 +26,10 @@ def parse_args(): '--dir', type=str, help='Directory containing YAML configuration files') - group.add_argument('--log-dir', - type=str, - 
default=None, - help='Log directory') + parser.add_argument('--log-dir', + type=str, + default=None, + help='Log directory') return parser.parse_args() @@ -154,16 +154,20 @@ def submit_job(config, log_dir): {}).get('num_nextn_predict_layers', 0) # Calculate nodes based on world sizes - ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size'] - ctx_cp_size = config['worker_config']['ctx']['context_parallel_size'] - ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size'] + ctx_tp_size = config['worker_config']['ctx'].get('tensor_parallel_size', 1) + ctx_cp_size = config['worker_config']['ctx'].get('context_parallel_size', 1) + ctx_pp_size = config['worker_config']['ctx'].get('pipeline_parallel_size', + 1) ctx_world_size = ctx_tp_size * ctx_cp_size * ctx_pp_size ctx_nodes = calculate_nodes(ctx_world_size, ctx_num, gpus_per_node) - gen_tp_size = config['worker_config']['gen']['tensor_parallel_size'] - gen_cp_size = config['worker_config']['gen']['context_parallel_size'] - gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size'] + + gen_tp_size = config['worker_config']['gen'].get('tensor_parallel_size', 1) + gen_cp_size = config['worker_config']['gen'].get('context_parallel_size', 1) + gen_pp_size = config['worker_config']['gen'].get('pipeline_parallel_size', + 1) gen_world_size = gen_tp_size * gen_cp_size * gen_pp_size gen_nodes = calculate_nodes(gen_world_size, gen_num, gpus_per_node) + total_nodes = ctx_nodes + gen_nodes total_tasks = total_nodes * gpus_per_node @@ -259,7 +263,7 @@ def submit_job(config, log_dir): str(allocation["port"]), config['benchmark']['mode'], config['benchmark']['concurrency_list'], - str(slurm_config['numa_bind']), + str(slurm_config['numa_bind']).lower(), log_dir, str(profiling_config['nsys_on']).lower(), profiling_config['gen_profile_range'] @@ -303,6 +307,7 @@ def submit_job(config, log_dir): '--benchmark-ratio', str(config['benchmark']['benchmark_ratio']), '--streaming', 
str(config['benchmark']['streaming']).lower(), '--use-nv-sa-benchmark', str(config['benchmark']['use_nv_sa_benchmark']).lower(), + '--benchmark-mode', config['benchmark']['mode'], # Environment and paths '--dataset-file', config['benchmark']['dataset_file'], From 9e7182b6038207d3d6c4ef1094cbca6dc582ed29 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Mon, 15 Dec 2025 21:08:53 +0800 Subject: [PATCH 149/172] [TRTLLM-9615][feat] Implement a distributed tuning system (#9621) Four distinct strategies are implemented to accommodate different distributed tuning scenarios, including BROADCAST, INDEPENDENT, MERGE, PARALLEL. * Distributed tuning is disabled by default, with the INDEPENDENT strategy as the fallback. This conservative approach prevents unexpected behavior in standard use cases. * Only operations with significant tuning time overhead have been assigned the PARALLEL strategy, which allows the same tensor parallelism (TP) rank to tune tactics concurrently across different ranks. This targeted approach balances performance gains with stability. * Operations with nested tuning structures, such as NVFP4GemmUnifiedRunner, currently support only the INDEPENDENT strategy. This restriction exists because the synchronization mechanism is optimized only for leaf operations and doesn't yet handle nested hierarchies. 
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- examples/layer_wise_benchmarks/run.py | 2 +- tensorrt_llm/_torch/autotuner.py | 246 ++++++++++++++---- .../_torch/custom_ops/cute_dsl_custom_ops.py | 6 +- .../_torch/custom_ops/torch_custom_ops.py | 9 +- .../custom_ops/trtllm_gen_custom_ops.py | 41 ++- .../_torch/pyexecutor/model_engine.py | 5 +- tests/unittest/_torch/misc/test_autotuner.py | 130 ++++++++- 7 files changed, 362 insertions(+), 77 deletions(-) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py index 8e590dec44..c1e3ab5133 100644 --- a/examples/layer_wise_benchmarks/run.py +++ b/examples/layer_wise_benchmarks/run.py @@ -206,7 +206,7 @@ for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ if autotune_flag: if args.enable_autotuner: cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None - with autotune(cache_path=cache_path, rank=rank): + with autotune(cache_path=cache_path): run_pack() if args.run_type == "GEN": logger.info("Layer-wise benchmarks: Prefill KV cache") diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 748b6cbe04..679ce2ad82 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -1,6 +1,7 @@ import ast import contextlib import copy +import enum import inspect import itertools import json @@ -16,8 +17,25 @@ import torch from cuda.bindings import driver import tensorrt_llm +from tensorrt_llm._torch.distributed import Distributed from tensorrt_llm.bindings.internal.runtime import delay_kernel from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping + + +class DistributedTuningStrategy(enum.Enum): + """ + Strategy for distributed tuning. 
+ Args: + BROADCAST: One rank (rank 0) tunes and broadcasts results to others + INDEPENDENT: Each rank tunes independently (default for non-comm ops) + MERGE: All ranks participate in tuning and reach merge + PARALLEL: All ranks participate in tuning with partial tactics + """ + BROADCAST = "broadcast" + INDEPENDENT = "independent" + MERGE = "merge" + PARALLEL = "parallel" @dataclass(slots=True, unsafe_hash=True) @@ -99,6 +117,7 @@ class TuningConfig: This flag is to create circular buffer of input tensors to avoid L2 cache hits to simulate cold L2 cache. Notice that not all tuning processes can benefit from this feature. use_cuda_graph (bool): Whether to use CUDA graph for the tuning process. + distributed_tuning_strategy (DistributedTuningStrategy): Strategy for distributed tuning. """ dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...] = () constraint_specs: Tuple[ConstraintSpec, ...] = () @@ -106,6 +125,7 @@ class TuningConfig: inputs_pre_hook: Callable = None use_cold_l2_cache: bool = False use_cuda_graph: bool = True + distributed_tuning_strategy: DistributedTuningStrategy = DistributedTuningStrategy.INDEPENDENT @dataclass(unsafe_hash=True) @@ -229,7 +249,16 @@ class TunableRunner(ABC): @contextlib.contextmanager -def autotune(tune_mode: bool = True, cache_path: str = None, rank: int = 0): +def autotune(tune_mode: bool = True, cache_path: str = None): + """Context manager for autotuning with distributed support. 
+ + Args: + tune_mode: Whether to enable tuning mode + cache_path: Path to save/load cache files + """ + autotuner = AutoTuner.get() + rank = autotuner.mapping.rank + # if cache_path is provided, use the rank-specific file tune_required = tune_mode if cache_path is not None: @@ -242,25 +271,27 @@ def autotune(tune_mode: bool = True, cache_path: str = None, rank: int = 0): if file_exists: logger.info( f"[Autotuner] Loading cache from {cache_path_no_ext_rank}") - AutoTuner.get().profiling_cache.load_cache(cache_path_no_ext_rank) + autotuner.profiling_cache.load_cache(cache_path_no_ext_rank) # record the old tuning mode - old_mode = AutoTuner.get().is_tuning_mode - AutoTuner.get().is_tuning_mode = tune_required + old_mode = autotuner.is_tuning_mode + autotuner.is_tuning_mode = tune_required autotune_enabled = tune_required and not old_mode + if autotune_enabled: logger.info("[Autotuner] Autotuning process starts ...") + try: yield finally: - AutoTuner.get().is_tuning_mode = old_mode + autotuner.is_tuning_mode = old_mode if autotune_enabled: logger.info("[Autotuner] Autotuning process ends") # save cache if cache_path is not None: logger.info(f"[Autotuner] Saving cache to {cache_path_no_ext_rank}") - AutoTuner.get().profiling_cache.save_cache(cache_path_no_ext_rank) + autotuner.profiling_cache.save_cache(cache_path_no_ext_rank) @dataclass @@ -399,6 +430,9 @@ class AutoTunerProfilingCache: ), ) + def merge_cache_data(self, cache_data: Dict[str, Any]): + self.cache.update(cache_data) + def get_specific_custom_op(self, custom_op: str) -> Dict[Tuple, Tuple]: return {k: v for k, v in self.cache.items() if k[0] == custom_op} @@ -561,6 +595,11 @@ class AutoTuner: _instance = None def __init__(self, warmup=2, repeat=10, stream_delay_micro_secs=1000): + # Increase log level for AutoTuner associated logger` + self._log_level_to_info = os.getenv( + "TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO", '0') == '1' + self._debug_logger = logger.info if self._log_level_to_info else logger.debug 
+ self.repeat = repeat self.warmup = warmup self.stream_delay_micro_secs = stream_delay_micro_secs @@ -575,10 +614,9 @@ class AutoTuner: # Last captured choose_one() contexts self._last_capture: Optional['AutoTuner.TacticsCapture'] = None - # Increase log level for AutoTuner associated logger - self._log_level_to_info = os.getenv( - "TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO", '0') == '1' - self._debug_logger = logger.info if self._log_level_to_info else logger.debug + # Dsitributed tuning state + self._dist: Optional[Distributed] = None + self.mapping: Mapping = Mapping() @classmethod def get(cls): @@ -586,6 +624,9 @@ class AutoTuner: cls._instance = AutoTuner() return cls._instance + def set_mapping(self, mapping: Mapping = None): + self.mapping = mapping + class TacticsCapture: """Object returned by capture() that can be iterated to get all tactic combinations. @@ -768,42 +809,26 @@ class AutoTuner: self.stats.tuned_op_profiled_configs[custom_op] = 0 if custom_op not in self.stats.failed_profiling_count: self.stats.failed_profiling_count[custom_op] = set() - new_tuning_failure_occured = False + new_tuning_failure_occurred = False - for p in profiles: - tensors = self._prepare_input_tensors(p, inputs) - is_cache_hit, *_ = self.profiling_cache.search_cache( - custom_op, runners, p.get_opt_shapes(), tuning_config) - if not is_cache_hit: - # Initialize runner and tactic as None in case of no valid tactic or runners are found - best_runner_id, best_tactic, min_time, has_tuning_failure_occured = self._profile_runners( - custom_op, runners, tensors, p, tuning_config, **kwargs) - if best_runner_id is not None: - # At least one valid (runner, tactic) pair is found - cache_key = self.profiling_cache.get_cache_key( - custom_op, runners[best_runner_id], p.get_opt_shapes(), - tuning_config) + # Synchronize ranks before profiling + if self._should_current_rank_tune( + tuning_config.distributed_tuning_strategy): + for p in profiles: + tensors = self._prepare_input_tensors(p, 
inputs) + is_cache_hit, *_ = self.profiling_cache.search_cache( + custom_op, runners, p.get_opt_shapes(), tuning_config) + if not is_cache_hit: + # Initialize runner and tactic as None in case of no valid tactic or runners are found + best_runner_id, best_tactic, min_time, has_tuning_failure_occurred = self._profile_runners( + custom_op, runners, tensors, p, tuning_config, **kwargs) + new_tuning_failure_occurred = new_tuning_failure_occurred or has_tuning_failure_occurred - self._debug_logger( - f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." - ) - # inspect call stack - self.profiling_cache[cache_key] = (best_runner_id, - best_tactic, min_time) - - self.stats.tuned_op_profiled_configs[custom_op] += 1 - else: - logger.warning_once( - f"[Autotuner] No valid runner/tactic was found for custom_op={custom_op}, input_shapes={input_shapes}. " - f"At least one valid (runner, tactic) pair is required. " - f"If get_valid_tactics is intended to return empty list, please ensure that this profile is not valid for the custom_op " - f"and should not occurs during the inference stage, or fallback tactic is implemented. Otherwise, the the tuning process will crash.", - key=(custom_op, "warning_autotuning_no_valid_tactic"), - ) - new_tuning_failure_occured = new_tuning_failure_occured or has_tuning_failure_occured + self._maybe_sync_cache_data(tuning_config.distributed_tuning_strategy, + custom_op) # If failed profiling tactics occurs, log the error. - if new_tuning_failure_occured: + if new_tuning_failure_occurred: logger.warning_once( f"[Autotuner] New tuning error occurs:" f"Total failed profiling tactics occurs: {len(self.stats.failed_profiling_count[custom_op])} for custom_op={custom_op}. 
" @@ -834,7 +859,7 @@ class AutoTuner: **kwargs, ) -> float: min_time = float('inf') - has_tuning_failure_occured = False + has_tuning_failure_occurred = False best_runner_id, best_tactic = None, None # If the inputs_pre_hook is provided, it will be called before profiling. if tuning_config.inputs_pre_hook is not None: @@ -845,8 +870,11 @@ class AutoTuner: p.name for p in inspect.signature(runner.forward).parameters.values() } - valid_tactics = runner.get_valid_tactics(input_tensors, profile, - **kwargs) + all_valid_tactics = runner.get_valid_tactics( + input_tensors, profile, **kwargs) + + valid_tactics = self._maybe_parallelize_tactics( + all_valid_tactics, tuning_config.distributed_tuning_strategy) if "do_preparation" in runner_arg_names and len(valid_tactics) > 0: runner( input_tensors, @@ -882,12 +910,36 @@ class AutoTuner: # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics # or some runtime error occurs during profiling. time_measured = float('inf') - has_tuning_failure_occured = True + has_tuning_failure_occurred = True if time_measured < min_time: min_time = time_measured best_runner_id, best_tactic = runner_id, tac - return best_runner_id, best_tactic, min_time, has_tuning_failure_occured + if best_runner_id is not None: + # At least one valid (runner, tactic) pair is found + cache_key = self.profiling_cache.get_cache_key( + custom_op, runners[best_runner_id], profile.get_opt_shapes(), + tuning_config) + + self._debug_logger( + f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." 
+ ) + # inspect call stack + # TODO: use named tuple to make it more readable + self.profiling_cache[cache_key] = (best_runner_id, best_tactic, + min_time) + + self.stats.tuned_op_profiled_configs[custom_op] += 1 + else: + logger.warning_once( + f"[Autotuner] No valid runner/tactic was found for custom_op={custom_op}, input_shapes={profile.get_opt_shapes()}. " + f"At least one valid (runner, tactic) pair is required. " + f"If get_valid_tactics is intended to return empty list, please ensure that this profile is not valid for the custom_op " + f"and should not occurs during the inference stage, or fallback tactic is implemented. Otherwise, the the tuning process will crash.", + key=(custom_op, "warning_autotuning_no_valid_tactic"), + ) + + return best_runner_id, best_tactic, min_time, has_tuning_failure_occurred def _get_input_sizes(self, inputs: List[torch.Tensor]) -> List[torch.Size]: @@ -1358,3 +1410,103 @@ class AutoTuner: return nvrtc.nvrtcGetErrorString(error)[1] else: raise RuntimeError("Unknown error type: {}".format(error)) + + def setup_distributed_state(self, mapping: Mapping, dist: Distributed): + """Setup distributed communication state for autotuning.""" + self.mapping = mapping + self._dist = dist + self._debug_logger( + f"[AutoTuner] Whether using distributed tuning: {self._is_distributed()}" + ) + + def _is_distributed(self) -> bool: + """Check if we're in a distributed environment.""" + return self.mapping is not None and self.mapping.tp_size > 1 and self._dist is not None + + def _maybe_parallelize_tactics( + self, all_valid_tactics: List[Any], + strategy: DistributedTuningStrategy) -> List[Any]: + """Parallelize tactics across all TP ranks if strategy is PARALLEL.""" + if strategy == DistributedTuningStrategy.PARALLEL: + # only distribute across TP ranks + # each TP rank will only tune the tactics that are assigned to it + tp_size = self.mapping.tp_size + tp_rank = self.mapping.tp_rank + valid_tactics = [] + for idx, tactic in 
enumerate(all_valid_tactics): + if idx % tp_size == tp_rank: + valid_tactics.append(tactic) + return valid_tactics + else: + return all_valid_tactics + + def _maybe_sync_cache_data(self, strategy: DistributedTuningStrategy, + custom_op: str): + """Synchronize cache data across all ranks.""" + if not self._is_distributed(): + logger.warning( + f"[AutoTuner] Not in distributed environment, skipping synchronization" + ) + return + + if strategy == DistributedTuningStrategy.BROADCAST: + self._broadcast_cache_data(custom_op) + elif strategy == DistributedTuningStrategy.INDEPENDENT: + return + elif strategy == DistributedTuningStrategy.MERGE: + self._merge_cache_data(custom_op) + elif strategy == DistributedTuningStrategy.PARALLEL: + self._merge_cache_data(custom_op) + else: + logger.error( + f"[AutoTuner] Unknown distributed tuning strategy: {strategy}, falling back to independent" + ) + return + + def _merge_cache_data(self, custom_op: str): + cache_data = self.profiling_cache.get_specific_custom_op(custom_op) + merged_cache_data = dict() + all_cache_data = self._dist.tp_allgather(obj=cache_data) + + for data in all_cache_data: + for key, value in data.items(): + current_time = merged_cache_data.get(key, [ + float('inf'), + ])[-1] + if value[-1] < current_time: + merged_cache_data[key] = value + + self.profiling_cache.merge_cache_data(merged_cache_data) + + def _broadcast_cache_data( + self, + custom_op: str, + ) -> None: + """Broadcast tactics from root rank to all other ranks.""" + cache_data = self.profiling_cache.get_specific_custom_op(custom_op) + root = 0 + cache_data = self._dist.tp_broadcast(obj=cache_data, root=root) + + self.profiling_cache.merge_cache_data(cache_data) + + def _should_current_rank_tune(self, + strategy: DistributedTuningStrategy) -> bool: + """Determine if this rank should perform tuning based on strategy.""" + if not self._is_distributed(): + return True + + if strategy == DistributedTuningStrategy.BROADCAST: + # Only rank 0 tunes + return 
self.mapping.rank == 0 + elif strategy in { + DistributedTuningStrategy.INDEPENDENT, + DistributedTuningStrategy.MERGE, + DistributedTuningStrategy.PARALLEL, + }: + # All ranks tune independently + return True + else: + logger.error( + f"[AutoTuner] Unknown distributed tuning strategy: {strategy}, falling back to independent" + ) + return True diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index d497ace49b..1b072eba48 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -7,8 +7,9 @@ from tensorrt_llm.logger import logger from ..._utils import get_sm_version from ...math_utils import pad_up -from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec, - OptimizationProfile, TunableRunner, TuningConfig) +from ..autotuner import (AutoTuner, ConstraintSpec, DistributedTuningStrategy, + DynamicTensorSpec, OptimizationProfile, TunableRunner, + TuningConfig) from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE from ..utils import (fp4_scale_infer_shape, get_last_power_of_2_num_tokens_buckets, @@ -364,6 +365,7 @@ if IS_CUTLASS_DSL_AVAILABLE: last_positive_power_of_2), ), constraint_specs=(ConstraintSpec(2, 0, fp4_scale_infer_shape), ), use_cold_l2_cache=True, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL, ) def __init__(self, diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index fe09758cfe..d338f61145 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -10,8 +10,9 @@ from tensorrt_llm import deep_gemm from tensorrt_llm._utils import get_sm_version from tensorrt_llm.logger import logger -from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec, - OptimizationProfile, TunableRunner, TuningConfig) +from ..autotuner import (AutoTuner, 
ConstraintSpec, DistributedTuningStrategy, + DynamicTensorSpec, OptimizationProfile, TunableRunner, + TuningConfig) from ..cublaslt_utils import IS_CUBLASLT_AVAILABLE from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE from ..modules.multi_stream_utils import do_multi_stream @@ -35,6 +36,7 @@ class MoERunner(TunableRunner): 0, 0, get_last_power_of_2_num_tokens_buckets, last_positive_power_of_2), ), tune_max_num_tokens=8192, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL, ) def __init__( @@ -103,11 +105,8 @@ class MoERunner(TunableRunner): self.output_dtype, self.top_k, self.tp_size, - self.tp_rank, self.ep_size, - self.ep_rank, self.cluster_size, - self.cluster_rank, self.enable_alltoall, self.use_deepseek_fp8_block_scale, self.use_w4_group_scaling, diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index a8236d88fc..f3918d0aa2 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -11,8 +11,9 @@ from tensorrt_llm._torch.utils import (Fp4QuantizedTensor, fp4_utils, last_positive_power_of_2, next_positive_power_of_2) -from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec, - OptimizationProfile, TunableRunner, TuningConfig) +from ..autotuner import (AutoTuner, ConstraintSpec, DistributedTuningStrategy, + DynamicTensorSpec, OptimizationProfile, TunableRunner, + TuningConfig) def prepare_dummy_topk_and_hook( @@ -345,8 +346,10 @@ class FP4BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config @@ 
-667,8 +670,10 @@ class FP8BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config @@ -966,8 +971,10 @@ class MxE4m3MxE2m1BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config @@ -1237,8 +1244,10 @@ class E4m3MxE2m1BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config @@ -1506,8 +1515,10 @@ class Bf16MxE2m1BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config @@ -1764,8 +1775,10 @@ class 
FP8FP4BlockScaleMoERunner(TunableRunner): dynamic_tensor_specs = cls.get_dynamic_tensor_specs() constraint_specs = cls.get_constraint_specs() - tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, - constraint_specs=constraint_specs) + tuning_config = TuningConfig( + dynamic_tensor_specs=dynamic_tensor_specs, + constraint_specs=constraint_specs, + distributed_tuning_strategy=DistributedTuningStrategy.PARALLEL) return tuning_config diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 10054bee8c..7574b8f6fd 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -626,7 +626,7 @@ class PyTorchModelEngine(ModelEngine): """Runs a forward pass to populate the autotuner cache.""" if not self.llm_args.enable_autotuner: return - + AutoTuner.get().setup_distributed_state(self.mapping, self.dist) logger.info("Running autotuner warmup...") kv_cache_manager = resource_manager.get_resource_manager( self.kv_cache_manager_key) @@ -636,8 +636,7 @@ class PyTorchModelEngine(ModelEngine): self.batch_size * (self.max_seq_len - 1)) cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None) - with self.no_cuda_graph(), autotune(cache_path=cache_path, - rank=self.mapping.rank): + with self.no_cuda_graph(), autotune(cache_path=cache_path): warmup_request = self._create_warmup_request( resource_manager, curr_max_num_tokens, 0) with self._release_batch_context(warmup_request, diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index 2323d0ac98..a6116d544f 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -1,20 +1,38 @@ import itertools import os +import pickle +import sys import tempfile from typing import Any, List +import cloudpickle +import pytest import torch +from mpi4py import MPI +import tensorrt_llm import 
tensorrt_llm._torch.autotuner as autotuner -from tensorrt_llm._torch.autotuner import (AutoTuner, DynamicDim, - DynamicTensorSpec, FakeTensor, - OptimizationProfile, StaticDim, - TunableRunner, TuningConfig, - autotune) +from tensorrt_llm._torch.autotuner import (AutoTuner, DistributedTuningStrategy, + DynamicDim, DynamicTensorSpec, + FakeTensor, OptimizationProfile, + StaticDim, TunableRunner, + TuningConfig, autotune) from tensorrt_llm._torch.utils import (get_power_of_2_num_tokens_buckets, next_positive_power_of_2) from tensorrt_llm.bindings.internal.runtime import delay_kernel from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +cloudpickle.register_pickle_by_value(sys.modules[__name__]) +MPI.pickle.__init__( + cloudpickle.dumps, + cloudpickle.loads, + pickle.HIGHEST_PROTOCOL, +) + +# needed since we reuse the mpi executor pool, first test running will leak a thread +pytestmark = pytest.mark.threadleak(enabled=False) def test_multi_dynamic_dims(): @@ -599,3 +617,105 @@ def test_kernel_testing_mismatched_ops(): assert "Custom op mismatch" in error_msg, f"Expected 'Custom op mismatch' in error message, got: {error_msg}" assert "test_op_A" in error_msg, f"Expected 'test_op_A' in error message, got: {error_msg}" assert "test_op_B" in error_msg, f"Expected 'test_op_B' in error message, got: {error_msg}" + + +class DistributedGemmRunner(TunableRunner): + + def __init__(self, prefer_tactics: List[int] = [0, 1]): + self.prefer_tactics = prefer_tactics + + def get_valid_tactics(self, inputs, profile, **kwargs): + # Return all tactics so merge strategy can choose between them + return self.prefer_tactics + + def forward(self, inputs, *, tactic=-1, **kwargs): + # tactic 0 is slower + if tactic % 2 == 0: + for _ in range(5): + inputs[0] @ inputs[1] + return inputs[0] @ inputs[1] + + def unique_id(self): + return () + + +def _distributed_worker_function(world_size, strategy): + 
"""Worker function to run on each MPI rank.""" + rank = tensorrt_llm.mpi_rank() + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=world_size, + pp_size=1) + tuner = AutoTuner.get() + tuner.clear_cache() + tuner.setup_distributed_state(mapping) + + x = torch.randn(16, 32, device='cuda') + w = torch.randn(32, 64, device='cuda') + inputs = [x, w] + + if strategy == DistributedTuningStrategy.PARALLEL: + # All ranks get the same set of tactics + prefer_tactics = [0, 1, 2, 3] + else: + # Each rank prefers different tactics + prefer_tactics = [rank] + runner = DistributedGemmRunner(prefer_tactics=prefer_tactics) + config = TuningConfig(distributed_tuning_strategy=strategy) + + cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None) + with autotune(tune_mode=True, cache_path=cache_path): + tuner.choose_one(custom_op=f"test_distributed_{strategy}", + runners=[runner], + tuning_config=config, + inputs=inputs) + selected_runner, best_tactic = tuner.choose_one( + custom_op=f"test_distributed_{strategy}", + runners=[runner], + tuning_config=config, + inputs=inputs) + + if strategy == DistributedTuningStrategy.BROADCAST: + # All ranks should select tactic 0 + assert best_tactic == 0 + elif strategy == DistributedTuningStrategy.INDEPENDENT: + # Each rank should select the tactic it prefers + assert best_tactic == rank + elif strategy == DistributedTuningStrategy.MERGE: + # Because tactic 0 is slower, two ranks should always select tactic 1 + assert best_tactic == 1 + elif strategy == DistributedTuningStrategy.PARALLEL: + # Tactic 1 or 3 should be selected since they are faster. 
+ # TODO: This might not cover the case that rank1 tunes nothing + assert best_tactic % 2 == 1 + else: + assert False, f"Unknown strategy: {strategy}" + + return True + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Requires at least 2 GPUs for this test") +@pytest.mark.parametrize( + "strategy", + [ + DistributedTuningStrategy.BROADCAST, + DistributedTuningStrategy.INDEPENDENT, + DistributedTuningStrategy.MERGE, + DistributedTuningStrategy.PARALLEL, + ], +) +@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True) +def test_distributed_broadcast_strategy(strategy, mpi_pool_executor): + """Test broadcast strategy with real MPI processes.""" + world_size = 2 + # Use MPIPoolExecutor to run distributed test + results = mpi_pool_executor.map( + _distributed_worker_function, + *zip(*[( + world_size, + strategy, + )] * world_size), + ) + for r in results: + assert r is True From 3230fbe79a6d94a5a4398451cc36e02b51f81229 Mon Sep 17 00:00:00 2001 From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Date: Mon, 15 Dec 2025 21:39:37 +0800 Subject: [PATCH 150/172] [None][feat] Update reasoning parser for nano-v3 (#9944) Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> --- tensorrt_llm/llmapi/reasoning_parser.py | 17 +++- tensorrt_llm/serve/postprocess_handlers.py | 9 ++- .../unittest/llmapi/test_reasoning_parser.py | 78 +++++++++++++++++++ 3 files changed, 100 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/llmapi/reasoning_parser.py b/tensorrt_llm/llmapi/reasoning_parser.py index 64e7d0fc64..6ea24fecef 100644 --- a/tensorrt_llm/llmapi/reasoning_parser.py +++ b/tensorrt_llm/llmapi/reasoning_parser.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Type +from typing import Any, Optional, Type @dataclass @@ -109,15 +109,28 @@ class ReasoningParserFactory: parsers: dict[str, Type[BaseReasoningParser]] = { "deepseek-r1": DeepSeekR1Parser, "qwen3": 
DeepSeekR1Parser, + "nano-v3": DeepSeekR1Parser, } @staticmethod - def create_reasoning_parser(reasoning_parser: str) -> BaseReasoningParser: + def create_reasoning_parser( + reasoning_parser: str, + chat_template_kwargs: Optional[dict[str, Any]] = None + ) -> BaseReasoningParser: try: reasoning_parser_class = ReasoningParserFactory.parsers[ reasoning_parser.lower()] if reasoning_parser == "deepseek-r1": return reasoning_parser_class(reasoning_at_start=True) + elif reasoning_parser == "nano-v3": + # Note: If the model is with reasoning (default behavior), `reasoning_at_start` should be True, and the starting response should be parsed into `reasoning_content`. + # While the model is without reasoning, `reasoning_at_start` should be False to parse the response into `content` fields. + is_reasoning_model = True + if isinstance(chat_template_kwargs, dict): + is_reasoning_model = chat_template_kwargs.get( + "enable_thinking", True) + return reasoning_parser_class( + reasoning_at_start=is_reasoning_model) return reasoning_parser_class() except KeyError as e: raise ValueError( diff --git a/tensorrt_llm/serve/postprocess_handlers.py b/tensorrt_llm/serve/postprocess_handlers.py index 8a9c203805..aa56cc6e5b 100644 --- a/tensorrt_llm/serve/postprocess_handlers.py +++ b/tensorrt_llm/serve/postprocess_handlers.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import List, Literal, Optional, Tuple, Union +from typing import Any, List, Literal, Optional, Tuple, Union from .._utils import nvtx_range_debug from ..executor import (DetokenizedGenerationResultBase, GenerationResult, @@ -55,6 +55,7 @@ class ChatPostprocArgs(PostprocArgs): tool_parser_dict: dict[int, BaseToolParser] = field(default_factory=dict) has_tool_call: dict[int, bool] = field(default_factory=dict) tool_call_id_type: str = "random" + chat_template_kwargs: Optional[dict[str, Any]] = None @classmethod def from_request(cls, request: ChatCompletionRequest): @@ -69,6 +70,7 @@ class 
ChatPostprocArgs(PostprocArgs): stream_options=request.stream_options, return_logprobs=bool(request.logprobs), top_logprobs=bool(request.top_logprobs), + chat_template_kwargs=request.chat_template_kwargs, ) @@ -108,9 +110,10 @@ def apply_reasoning_parser(args: ChatPostprocArgs, output_index: int, text: str, reasoning_parser = None if args.reasoning_parser is not None: if output_index not in args.reasoning_parser_dict: + chat_template_kwargs = getattr(args, "chat_template_kwargs", None) args.reasoning_parser_dict[ output_index] = ReasoningParserFactory.create_reasoning_parser( - args.reasoning_parser) + args.reasoning_parser, chat_template_kwargs) reasoning_parser = args.reasoning_parser_dict[output_index] if reasoning_parser is not None: @@ -501,6 +504,7 @@ class ChatCompletionPostprocArgs(PostprocArgs): tool_choice: Optional[Union[Literal["none", "auto"], ChatCompletionNamedToolChoiceParam]] request_id: Optional[int] = None + chat_template_kwargs: Optional[dict[str, Any]] = None @classmethod def from_request(cls, request: ChatCompletionRequest): @@ -508,6 +512,7 @@ class ChatCompletionPostprocArgs(PostprocArgs): model=request.model, tools=request.tools, tool_choice=request.tool_choice, + chat_template_kwargs=request.chat_template_kwargs, ) diff --git a/tests/unittest/llmapi/test_reasoning_parser.py b/tests/unittest/llmapi/test_reasoning_parser.py index 456a6674e2..2df9d1d32e 100644 --- a/tests/unittest/llmapi/test_reasoning_parser.py +++ b/tests/unittest/llmapi/test_reasoning_parser.py @@ -71,3 +71,81 @@ def test_qwen3_reasoning_parser_stream(delta_texts: list, content: list, result = reasoning_parser.parse_delta(delta_text) assert result.content == content[i] assert result.reasoning_content == reasoning_context[i] + + +@pytest.mark.parametrize( + ("text", "content", "reasoning_context", "chat_template_kwargs"), + [ + ("a b", "", "a b", None), + (f"{R1_END} a b", " a b", "", None), + (f"a {R1_END} b", " b", "a ", None), + (f"a b {R1_END}", "", "a b ", None), + 
(f"{R1_START} a {R1_END} b", " b", f"{R1_START} a ", None), + # All without reasoning_context. + ("a b", "a b", "", { + "enable_thinking": False + }), + (f"{R1_END} a b", f"{R1_END} a b", "", { + "enable_thinking": False + }), + (f"a {R1_END} b", f"a {R1_END} b", "", { + "enable_thinking": False + }), + (f"a b {R1_END}", f"a b {R1_END}", "", { + "enable_thinking": False + }), + ]) +def test_nano_v3_reasoning_parser(text: str, content: str, + reasoning_context: str, + chat_template_kwargs: dict): + reasoning_parser = ReasoningParserFactory.create_reasoning_parser( + "nano-v3", chat_template_kwargs) + result = reasoning_parser.parse(text) + print(f"text: {text}, result: {result}") + assert result.content == content + assert result.reasoning_content == reasoning_context + + +@pytest.mark.parametrize( + ("delta_texts", "content", "reasoning_context", "chat_template_kwargs"), + [ + (["a", "b"], ["", ""], ["a", "b"], None), + ([R1_END, "a", "b"], ["", "a", "b"], ["", "", ""], None), + (["a", R1_END, "b"], ["", "", "b"], ["a", "", ""], None), + (["a", "b", R1_END], ["", "", ""], ["a", "b", ""], None), + (["a", f"l{R1_END}", "b"], ["", "", "b"], ["a", "l", ""], None), + (["a", f"l{R1_END}r", "b"], ["", "r", "b"], ["a", "l", ""], None), + (["a", f"{R1_END}r", "b"], ["", "r", "b"], ["a", "", ""], None), + # All without reasoning_context. 
+ (["a", "b"], ["a", "b"], ["", ""], { + "enable_thinking": False + }), + ([R1_END, "a", "b"], ["", f"{R1_END}a", "b"], ["", "", ""], { + "enable_thinking": False + }), + (["a", R1_END, "b"], ["a", "", f"{R1_END}b"], ["", "", ""], { + "enable_thinking": False + }), + (["a", "b", R1_END], ["a", "b", ""], ["", "", ""], { + "enable_thinking": False + }), + (["a", f"l{R1_END}", "b"], ["a", f"l{R1_END}", "b"], ["", "", ""], { + "enable_thinking": False + }), + (["a", f"l{R1_END}r", "b"], ["a", f"l{R1_END}r", "b"], ["", "", ""], { + "enable_thinking": False + }), + (["a", f"{R1_END}r", "b"], ["a", f"{R1_END}r", "b"], ["", "", ""], { + "enable_thinking": False + }), + ]) +def test_nano_v3_reasoning_parser_stream(delta_texts: list, content: list, + reasoning_context: list, + chat_template_kwargs: dict): + reasoning_parser = ReasoningParserFactory.create_reasoning_parser( + "nano-v3", chat_template_kwargs) + for i, delta_text in enumerate(delta_texts): + result = reasoning_parser.parse_delta(delta_text) + print(f"delta_text: {delta_text}, result: {result}") + assert result.content == content[i] + assert result.reasoning_content == reasoning_context[i] From 4f75a31a45069887242ab9611a53fe2b94dd057f Mon Sep 17 00:00:00 2001 From: arekay-nv <230885705+arekay-nv@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:49:31 -0600 Subject: [PATCH 151/172] [https://nvbugs/5540979][fix] Potential fix for 5540979 (#9716) Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com> --- tensorrt_llm/serve/openai_client.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorrt_llm/serve/openai_client.py b/tensorrt_llm/serve/openai_client.py index e46a232603..48172ca666 100644 --- a/tensorrt_llm/serve/openai_client.py +++ b/tensorrt_llm/serve/openai_client.py @@ -159,6 +159,7 @@ class OpenAIHttpClient(OpenAIClient): is_stream = request.stream for attempt in range(self._max_retries + 1): try: + lines_yielded = 0 start_time = get_steady_clock_now_in_seconds() async 
with self._session.post(url, json=json_data) as http_response: content_type = http_response.headers.get("Content-Type", "") @@ -172,6 +173,7 @@ class OpenAIHttpClient(OpenAIClient): async for line in self._response_generator( request, http_response, start_time, server, hooks ): + lines_yielded += 1 yield line # don't finish the request here since the response generator is not done yet else: @@ -183,6 +185,12 @@ class OpenAIHttpClient(OpenAIClient): await self._finish_request(request) break # break and skip retries if the whole response is processed without exception except (aiohttp.ClientError, OSError) as e: + if lines_yielded > 0: + logger.error( + f"Client error to {url}: {e} - cannot retry since {lines_yielded} lines were yielded", + traceback.format_exc(), + ) + raise if attempt == self._max_retries: logger.error( f"Client error to {url}: {e} - last retry {attempt} of {self._max_retries}" From 63e7a2fa7058a0e1f8f1be8c0780c69e2c66565e Mon Sep 17 00:00:00 2001 From: zackyoray Date: Mon, 15 Dec 2025 18:31:48 +0200 Subject: [PATCH 152/172] [None][infra] Update ucx to 1.20.x (#9977) Signed-off-by: Yoray Zack Signed-off-by: Yoray Zack <62789610+zackyoray@users.noreply.github.com> --- docker/common/install_ucx.sh | 8 ++++++-- jenkins/current_image_tags.properties | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index 55da81e2c2..4a40679c80 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -1,7 +1,8 @@ #!/bin/bash set -ex -UCX_VERSION="v1.19.x" +UCX_VERSION="v1.20.x" +UCX_COMMIT="f656dbdf93e72e60b5d6ca78b9e3d9e744e789bd" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" UCX_REPO="https://github.com/openucx/ucx.git" @@ -9,7 +10,10 @@ UCX_REPO="https://github.com/openucx/ucx.git" mkdir -p /third-party-source rm -rf ${UCX_INSTALL_PATH} -git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} +git clone -b ${UCX_VERSION} ${UCX_REPO} +cd ucx +git 
checkout ${UCX_COMMIT} +cd .. tar -czf /third-party-source/ucx-${UCX_VERSION}.tar.gz ucx cd ucx ./autogen.sh diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 0787c16eb8..2ee623bae1 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512121105-9707 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512121105-9707 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512121105-9707 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512121105-9707 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512151112-9977 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512151112-9977 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512151112-9977 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512151112-9977 From 44b0f8c3ed89d7e6f7214e9c3f59c42d98e001ae Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 16 Dec 2025 00:52:52 +0800 Subject: [PATCH 
153/172] [None] [fix] Revert "[None] [feat] add eos_token_id in generation_config to sampling params" (#10002) --- tensorrt_llm/sampling_params.py | 40 +++++++++++-------- .../apps/_test_trtllm_serve_top_logprobs.py | 2 +- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/tensorrt_llm/sampling_params.py b/tensorrt_llm/sampling_params.py index 3aa2260cee..57bebba45e 100644 --- a/tensorrt_llm/sampling_params.py +++ b/tensorrt_llm/sampling_params.py @@ -373,6 +373,14 @@ class SamplingParams: if self.end_id is None: self.end_id = tokenizer.eos_token_id self.pad_id = tokenizer.pad_token_id + # kimi_k2 model uses the eos_token_id in generation config + if ( + hf_model_config is not None + and hf_model_config.model_type == "kimi_k2" + and generation_config is not None + and isinstance(generation_config.eos_token_id, int) + ): + self.end_id = generation_config.eos_token_id if self.pad_id is None: self.pad_id = self.end_id @@ -392,26 +400,24 @@ class SamplingParams: strs = [self.stop] if isinstance(self.stop, str) else self.stop self._stop_word_ids = [_encode(tokenizer, s, add_special_tokens) for s in strs] - # Add eos_token_id in generation_config to _stop_word_ids - # Refer to https://huggingface.co/docs/hub/en/transformers#transformers-repository-files and - # https://github.com/huggingface/transformers/blob/1ae4d917ed3badbdb1ffc167e0529f5a6d3c080d/src/transformers/generation/stopping_criteria.py#L451C1-L451C42 - # The eos_token_id in generation_config are really mean to stop the text generation. 
- if generation_config is not None and generation_config.eos_token_id is not None: - if isinstance(generation_config.eos_token_id, int): - generation_eos_token_ids = [generation_config.eos_token_id] - else: # always List[int] - generation_eos_token_ids = generation_config.eos_token_id - - if self._stop_word_ids is None: - self._stop_word_ids = [generation_eos_token_ids] - else: + # add generation_config to stop word list, only in qwen3-next now + if ( + hf_model_config is not None + and hf_model_config.model_type == "qwen3_next" + and generation_config is not None + and isinstance(generation_config.eos_token_id, List) + and all(isinstance(i, int) for i in generation_config.eos_token_id) + ): + if self._stop_word_ids: all_stop_tokens_id = set(i for sublist in self._stop_word_ids for i in sublist) - from_generation_stop_token_ids = [ - i for i in generation_eos_token_ids if i not in all_stop_tokens_id + from_generation_stop_tokens = [ + i for i in generation_config.eos_token_id if i not in all_stop_tokens_id ] - if from_generation_stop_token_ids: - self._stop_word_ids.append(from_generation_stop_token_ids) + if from_generation_stop_tokens: + self._stop_word_ids.append(from_generation_stop_tokens) + else: + self._stop_word_ids = [generation_config.eos_token_id] return self diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py index c7a4fc7f16..d287e5e35e 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_top_logprobs.py @@ -110,7 +110,7 @@ async def test_chat_completion_top1_logprobs(async_client: openai.AsyncOpenAI, "content": "You are a helpful assistant." }, { "role": "user", - "content": "What is the capital of France? please in detail." + "content": "What is the capital of France?" 
}] # Test top_logprobs=1 chat_completion = await async_client.chat.completions.create( From 0c31502fbc168b01726fb91c165006dd036e1176 Mon Sep 17 00:00:00 2001 From: Faraz <58580514+farazkh80@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:07:06 -0500 Subject: [PATCH 154/172] [None][feat] disable fused gemm for sm121 (#9916) Signed-off-by: list <58580514+farazkh80@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_deepseekv3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 8df4eae706..605972ab5c 100755 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -518,7 +518,7 @@ class DeepseekV3Linear(Linear): layer_idx: Optional[int] | None = None): num_tokens = input.shape[0] if (not self.has_any_quant and 1 <= num_tokens <= 16 - and get_sm_version() != 120): + and get_sm_version() not in [120, 121]): output = torch.ops.trtllm.dsv3_fused_a_gemm_op( input, self.weight.t(), bias, None) else: From d5d15c06df436e476f1891a52ee8dcc4ecf5e2f0 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Tue, 16 Dec 2025 01:29:43 +0800 Subject: [PATCH 155/172] [None][infra] Waive failed tests for main branch on 12/15 (#10001) Signed-off-by: qqiao Signed-off-by: Yanchao Lu Co-authored-by: Yanchao Lu --- tests/integration/test_lists/waives.txt | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f8866583f3..e829964a1a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -412,7 +412,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-kv_cache_aware] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-load_balancing] SKIP (https://nvbugs/5726066) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5726118) disaggregated/test_auto_scaling.py::test_worker_restart[http-kv_cache_aware] SKIP (https://nvbugs/5726066) @@ -447,3 +446,20 @@ unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_serve unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py SKIP (https://nvbugs/5741060) full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] SKIP (https://nvbugs/5596337) full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5596337) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix SKIP (https://nvbugs/5741331) +disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740359) +unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392) +unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740359) +disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] SKIP (https://nvbugs/5726066) From 9ba14263db0045ed3fa0860f949b5ce320107eb3 Mon Sep 17 00:00:00 2001 From: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:32:15 -0500 Subject: [PATCH 156/172] [https://nvbugs/5673559][fix] Unwaiving disagg test for nvbug 5673559 (#9957) Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index e829964a1a..3e064214fe 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -349,7 +349,6 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP ( test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469) test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610) -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5673578) examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826) 
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216) From e6187d8109bc7107d6e0419d44e674e218e48df0 Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Mon, 15 Dec 2025 14:26:52 -0800 Subject: [PATCH 157/172] [https://nvbugs/5708810][fix] Fix TRTLLMSampler (#9710) Signed-off-by: Michal Guzek --- tensorrt_llm/_torch/pyexecutor/sampler.py | 6 ++- tensorrt_llm/executor/result.py | 12 ++++- .../_torch/sampler/test_trtllm_sampler.py | 51 +++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index c358c9eefe..0aefd2a439 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -3135,7 +3135,11 @@ class TRTLLMSampler(Sampler, AsyncWorkerMixin): ) } ] - cum_log_probs = [cum_log_probs_host[seq_slot]] + cum_log_probs = [ + cum_log_probs_host[seq_slot][0] + if isinstance(cum_log_probs_host[seq_slot], list) + else cum_log_probs_host[seq_slot] + ] request.py_result.append_log_probs([log_probs], cum_log_probs) idx += 1 diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 28d35c43a7..603c567ed5 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -319,7 +319,14 @@ class GenerationResultBase: if response_tensors.request_perf_metrics is not None: output.request_perf_metrics = response_tensors.request_perf_metrics - if self._done: + # Check if this specific sequence is finished (not just if the entire request is done) + # This is important for best_of > n sampling where sequences finish at different times + sequence_is_finished = (finish_reasons and finish_reasons[src_idx] + != tllm.FinishReason.NOT_FINISHED + and finish_reasons[src_idx] + != tllm.FinishReason.CANCELLED) or self._done + + if sequence_is_finished: if finish_reasons[src_idx] == 
tllm.FinishReason.END_ID: output.finish_reason = 'stop' elif finish_reasons[src_idx] == tllm.FinishReason.STOP_WORDS: @@ -344,6 +351,9 @@ class GenerationResultBase: else: raise ValueError( f"Unknown finish reason: {finish_reasons[src_idx]}") + + # Only record stats and do tracing when the entire request is done + if self._done: self.record_stats(output, req_perf_metrics_dict) self.do_tracing(output, req_perf_metrics_dict) diff --git a/tests/unittest/_torch/sampler/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py index 355ab4cce7..032a7bc216 100644 --- a/tests/unittest/_torch/sampler/test_trtllm_sampler.py +++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py @@ -146,3 +146,54 @@ def test_torch_sampler_with_multi_token_stop_words(model_path): assert len(text) > 0, "Should generate some text" assert stop_string not in text, f"Stop string '{repr(stop_string)}' should not appear in the output" + + +@pytest.mark.high_cuda_memory +def test_trtllm_sampler_best_of_with_logprobs(model_path): + """Test TRTLLMSampler with best_of > n and logprobs.""" + + llm = create_llm(model_path) + + prompt = "The capital of France is" + + sampling_config = SamplingParams( + max_tokens=10, + temperature=1.0, + top_k=2, + n=2, # Return 2 sequences + best_of=3, # Generate 3 candidates, pick best 2 + logprobs=1 # Return log probabilities + ) + + outputs = llm.generate([prompt], sampling_params=sampling_config) + + llm.shutdown() + + assert len(outputs) == 1, "Should return one request output" + + request_output = outputs[0] + completion_outputs = request_output.outputs + + assert len( + completion_outputs + ) == 2, f"Expected 2 outputs (n=2), got {len(completion_outputs)}" + + for i, output in enumerate(completion_outputs): + assert len(output.text) > 0, f"Output {i} should have generated text" + + assert output.finish_reason is not None, \ + f"Output {i} must have a finish_reason" + + assert output.cumulative_logprob is not None, \ + f"Output {i} should 
have cumulative_logprob when logprobs is requested" + assert isinstance(output.cumulative_logprob, (float, int)), \ + f"Output {i} cumulative_logprob should be a number, got {type(output.cumulative_logprob)}" + + assert output.logprobs is not None, \ + f"Output {i} should have logprobs when logprobs=1" + assert len(output.logprobs) == len(output.token_ids), \ + f"Output {i} should have logprobs for each token" + + if len(completion_outputs) >= 2: + assert completion_outputs[0].cumulative_logprob >= completion_outputs[1].cumulative_logprob, \ + "Outputs should be sorted by cumulative log probability (best first)" From b757ea73ba4edf5f0fbefefe59cd951c4ab8a7c2 Mon Sep 17 00:00:00 2001 From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Date: Tue, 16 Dec 2025 10:58:59 +0800 Subject: [PATCH 158/172] [TRTLLM-9641][infra] Use public triton 3.5.0 in SBSA (#9652) Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f740a9ede..b57ffa056f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -65,7 +65,7 @@ ninja etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a blake3 soundfile -triton==3.5.0; platform_machine == "x86_64" +triton==3.5.0 tiktoken blobfile openai-harmony==0.0.4 From cdf56c278f790d3b438445937dbee82750092be8 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Tue, 16 Dec 2025 10:59:13 +0800 Subject: [PATCH 159/172] [TRTLLM-8638][fix] Add failed cases into waives.txt New activity. 
(#9979) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 3e064214fe..36711424f6 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -462,3 +462,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740359) disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] SKIP (https://nvbugs/5726066) +examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293) +examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293) +examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293) +disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5741884) From 4ce35eacf1c679a7db50200a3e7c9721411ea89b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Tue, 16 Dec 2025 11:50:41 +0800 Subject: [PATCH 160/172] [TRTLLM-9794][ci] move more test cases to gb200 (#9994) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- jenkins/L0_Test.groovy | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 73c4cd16c2..f63a032369 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2943,8 +2943,7 @@ def launchTestJobs(pipeline, testFilter) 
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1], - "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 2, 4], - "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4], + "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true], From dff77efa2ac827503386066a5a013fba3ce23819 Mon Sep 17 00:00:00 2001 From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com> Date: Tue, 16 Dec 2025 11:59:08 +0800 Subject: [PATCH 161/172] [None][feat] Add routing support for the new model for both cutlass and trtllm moe backend (#9792) Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/noAuxTcKernels.cu | 47 ++++++--- .../blockScaleMoe/DevKernel.h | 28 +++--- .../blockScaleMoe/RoutingDeepSeek.cu | 96 +++++++++++-------- .../blockScaleMoe/RoutingKernel.h | 4 +- .../blockScaleMoe/RoutingKernelTopK.cuh | 2 +- .../blockScaleMoe/RoutingRenormalize.cu | 28 +++--- .../trtllmGenKernels/blockScaleMoe/runner.cu | 2 +- cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp | 2 +- cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp | 2 +- .../thop/fp8PerTensorScaleMoe.cpp | 2 +- cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp | 2 +- .../kernels/routing/routingDeepSeekTest.cpp | 33 +++++++ .../_torch/modules/fused_moe/routing.py | 9 +- .../_torch/thop/parallel/test_noaux_tc.py | 1 + tests/unittest/_torch/thop/serial/test_moe.py | 13 ++- 15 files changed, 180 insertions(+), 91 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu index 19eb4be4c1..efa69c7098 100644 --- 
a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu @@ -32,11 +32,14 @@ TRTLLM_NAMESPACE_BEGIN namespace kernels { static constexpr int WARP_SIZE = 32; +static constexpr int NumNemotronExperts = 512; static constexpr int NumKimiK2Experts = 384; static constexpr int NumDeepseekExperts = 256; +static constexpr int MaxSupportedExpertCount = std::max({NumNemotronExperts, NumKimiK2Experts, NumDeepseekExperts}); static constexpr int MaxNumExpertsUnit = 128; static constexpr int NumTopGroupScores = 2; -static constexpr int MaxNumTopExperts = 8; +static constexpr int DefaultMaxNumTopExperts = 8; +static constexpr int MaxSupportedTopExperts = 22; static constexpr int MaxNumTopGroups = 4; static __device__ inline float sigmoid_accurate(float x) @@ -44,7 +47,8 @@ static __device__ inline float sigmoid_accurate(float x) return 0.5f * tanhf(0.5f * x) + 0.5f; } -template +template __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, IdxT* topkIndices, BiasT* routingBias, int64_t const numTokens, int64_t const numGroup, int64_t const topkGroup, int64_t const topk, int64_t const numExperts, int64_t const numExpertsPerGroup, double const routedScalingFactor) @@ -132,7 +136,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx /* minValue */ invalidScoreFloat); // get the final group score and write it to shared - if (laneIdx == 0) + if (warp.thread_rank() == 0) { auto groupScore = topExpGroupScores[0] + topExpGroupScores[1]; smemGroupScores[warpIdx] = groupScore; @@ -151,9 +155,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx reduce_topk::reduceTopK(warp, topGroups, topGroupIdx, groupScore, laneIdx, /* minValue */ invalidScoreFloat); - // final expert selection: get relevant indexes and scores from shared - #pragma unroll for (int ii = 0; ii < MaxNumTopGroups; ++ii) { // bound of numGroup @@ -161,12 +163,11 @@ __global__ void 
deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx expertIdxGroup[ii] = groupIdx * numExpertsPerGroup + laneIdx; expertScoreGroup[ii] - = groupIdx < numGroup && expertSelected ? smemScoreBias[expertIdxGroup[ii]] : invalidScoreFloat; + = (ii < topkGroup) && expertSelected ? smemScoreBias[expertIdxGroup[ii]] : invalidScoreFloat; } - tensorrt_llm::kernels::reduce_topk::reduceTopK(warp, topScores, topExperts, expertScoreGroup, - expertIdxGroup, - /* minValue */ invalidScoreFloat, topk); + tensorrt_llm::kernels::reduce_topk::reduceTopK( + warp, topScores, topExperts, expertScoreGroup, expertIdxGroup, /* minValue */ invalidScoreFloat, topk); } } else if constexpr (MaxNumExperts > MaxNumExpertsUnit) @@ -197,11 +198,16 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx smemInterTopScores[warpIdx * MaxNumTopExperts + laneIdx] = topScores[laneIdx]; smemInterTopExperts[warpIdx * MaxNumTopExperts + laneIdx] = topExperts[laneIdx]; } + else if (laneIdx >= topk && laneIdx < MaxNumTopExperts) + { + smemInterTopScores[warpIdx * MaxNumTopExperts + laneIdx] = invalidScoreFloat; + smemInterTopExperts[warpIdx * MaxNumTopExperts + laneIdx] = MaxNumExperts - 1; + } } __syncthreads(); if (warpIdx == 0) { - int constexpr NumInterTopKPerThread = (NumInterTopK * NumExpertWarps - 1) / WARP_SIZE + 1; + int constexpr NumInterTopKPerThread = (NumInterTopK - 1) / WARP_SIZE + 1; float intermidiateScore[NumInterTopKPerThread]; int32_t intermidiateExpert[NumInterTopKPerThread]; for (int i = laneIdx; i < NumInterTopKPerThread * WARP_SIZE; i += WARP_SIZE) @@ -268,11 +274,11 @@ void invokeNoAuxTc(InputT* scores, BiasT* bias, OutputT* topk_values, IdxT* topk { // Check if we can use the optimized deepseek_v3_topk_kernel - bool const is_single_group = (n_group == 1) && (num_experts <= NumKimiK2Experts); + bool const is_single_group = (n_group <= 1) && (num_experts <= MaxSupportedExpertCount); int64_t const experts_per_group = num_experts / n_group; - bool 
const is_multi_group = (n_group != 1) && (num_experts <= NumDeepseekExperts) - && (experts_per_group <= WARP_SIZE) && (experts_per_group * topk_group <= MaxNumExpertsUnit); + bool const is_multi_group = (n_group > 1) && (num_experts <= NumDeepseekExperts) && (experts_per_group <= WARP_SIZE) + && (experts_per_group * topk_group <= MaxNumExpertsUnit); if (is_single_group || is_multi_group) { @@ -281,7 +287,20 @@ void invokeNoAuxTc(InputT* scores, BiasT* bias, OutputT* topk_values, IdxT* topk int num_threads = NumDeepseekExperts; if (is_single_group) { - if (num_experts > MaxNumExpertsUnit) + // Special case for Nemotron, which selects top 22 from 512 experts, and 1 group only. + if (num_experts == NumNemotronExperts && n_group == 1 && topk == MaxSupportedTopExperts) + { + kernel_instance = &deepseek_v3_topk_kernel; + num_threads = NumNemotronExperts; + } + else if (num_experts > NumKimiK2Experts && num_experts <= MaxSupportedExpertCount) + { + kernel_instance + = &deepseek_v3_topk_kernel; + num_threads = MaxSupportedExpertCount; + } + else if (num_experts > MaxNumExpertsUnit && num_experts <= NumKimiK2Experts) { kernel_instance = &deepseek_v3_topk_kernel; num_threads = NumKimiK2Experts; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h index 7e8c4fb720..7edc3d1953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h @@ -182,37 +182,37 @@ namespace moe::dev TLLM_LOG_ERROR("Unsupported dtypeExpW"); \ } -#define LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT( \ - data, coopLaunch, kernel, numBlocks, numThreads, smemSize, stream, extraFlag, forceFloatInput, numExperts) \ +#define LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, smemSize, \ + stream, extraFlag, forceFloatInput, numExperts, numTopExperts) \ if 
(data.mDtypeExpW == tg::Dtype::Fp32 && extraFlag) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, float, numExperts, true), kernel, numBlocks, numThreads, \ - smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, float, numExperts, numTopExperts, true), kernel, numBlocks, \ + numThreads, smemSize, stream); \ } \ else if (data.mDtypeExpW == tg::Dtype::Fp32) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, float, numExperts, false), kernel, numBlocks, numThreads, \ - smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, float, numExperts, numTopExperts, false), kernel, numBlocks, \ + numThreads, smemSize, stream); \ } \ else if (data.mDtypeExpW == tg::Dtype::Bfloat16 && extraFlag && forceFloatInput) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, __nv_bfloat16, numExperts, true), kernel, numBlocks, \ - numThreads, smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, __nv_bfloat16, numExperts, numTopExperts, true), kernel, \ + numBlocks, numThreads, smemSize, stream); \ } \ else if (data.mDtypeExpW == tg::Dtype::Bfloat16 && extraFlag) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, true), kernel, numBlocks, \ - numThreads, smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, numTopExperts, true), \ + kernel, numBlocks, numThreads, smemSize, stream); \ } \ else if (data.mDtypeExpW == tg::Dtype::Bfloat16 && forceFloatInput) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, __nv_bfloat16, numExperts, false), kernel, numBlocks, \ - numThreads, smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(float, __nv_bfloat16, numExperts, numTopExperts, false), kernel, \ + numBlocks, numThreads, smemSize, stream); \ } \ else if (data.mDtypeExpW == tg::Dtype::Bfloat16) \ { \ - LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, false), kernel, numBlocks, \ 
- numThreads, smemSize, stream); \ + LAUNCH_TILEN(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, numTopExperts, false), \ + kernel, numBlocks, numThreads, smemSize, stream); \ } \ else \ { \ diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu index 462fd5a091..6937a34ccd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu @@ -23,11 +23,13 @@ namespace routingDeepSeek { //////////////////////////////////////////////////////////////////////////////////////////////////// - +static constexpr int NumNemotronExperts = 512; static constexpr int NumKimiK2Experts = 384; static constexpr int NumDeepseekExperts = 256; +static constexpr int MaxSupportedExpertCount = std::max({NumNemotronExperts, NumKimiK2Experts, NumDeepseekExperts}); static constexpr int NumTopGroupScores = 2; -static constexpr int MaxNumTopExperts = 8; +static constexpr int DefaultMaxNumTopExperts = 8; +static constexpr int MaxSupportedTopExperts = 22; static constexpr int MaxNumTopGroups = 4; static constexpr int MaxNumGroups = 8; @@ -125,8 +127,8 @@ __global__ void routingMainKernel(KernelParams params) int32_t topGroupIdx[MaxNumTopGroups]; float expertScoreGroup[MaxNumTopGroups]; int32_t expertIdxGroup[MaxNumTopGroups]; - float topScores[MaxNumTopExperts]; // bound of params.mTopK - int32_t topExperts[MaxNumTopExperts]; + float topScores[KernelParams::MaxNumTopExperts]; // bound of params.mTopK + int32_t topExperts[KernelParams::MaxNumTopExperts]; if constexpr (KernelParams::UseGroups) { @@ -152,7 +154,6 @@ __global__ void routingMainKernel(KernelParams params) topk::reduceTopK(warp, topGroups, topGroupIdx, groupScore, laneIdx, /* minValue */ invalidScoreFloat); // final expert selection: get relevant indexes and scores from shared - #pragma unroll for 
(int ii = 0; ii < MaxNumTopGroups; ++ii) { // bound of params.mNumLimitedGroups @@ -164,7 +165,8 @@ __global__ void routingMainKernel(KernelParams params) // groupIdx * params.mNumExpertsPerGroup <= params.mNumExperts - params.mNumExpertsPerGroup // => expertIdxGroup[ii] < params.mNumExperts <= NumThreads, // so the access is safe here - expertScoreGroup[ii] = groupIdx < params.mNumExpertGroups && expertSelected + expertScoreGroup[ii] + = (ii < params.mNumLimitedGroups) && (groupIdx < params.mNumExpertGroups) && expertSelected ? smemScoreBias[expertIdxGroup[ii]] : invalidScoreFloat; } @@ -177,7 +179,7 @@ __global__ void routingMainKernel(KernelParams params) { // without groups, each thread just takes `MaxNumTopGroups` experts int constexpr NumExpertWarps = (KernelParams::MaxNumExperts - 1) / topk::MaxNumExpertsUnit + 1; - int constexpr NumInterTopK = NumExpertWarps * MaxNumTopExperts; + int constexpr NumInterTopK = NumExpertWarps * KernelParams::MaxNumTopExperts; __shared__ float __attribute((aligned(128))) smemInterTopScores[NumInterTopK]; __shared__ int32_t __attribute((aligned(128))) smemInterTopExperts[NumInterTopK]; if (warpIdx < NumExpertWarps) @@ -196,14 +198,20 @@ __global__ void routingMainKernel(KernelParams params) if (laneIdx < params.mTopK) { - smemInterTopScores[warpIdx * MaxNumTopExperts + laneIdx] = topScores[laneIdx]; - smemInterTopExperts[warpIdx * MaxNumTopExperts + laneIdx] = topExperts[laneIdx]; + smemInterTopScores[warpIdx * KernelParams::MaxNumTopExperts + laneIdx] = topScores[laneIdx]; + smemInterTopExperts[warpIdx * KernelParams::MaxNumTopExperts + laneIdx] = topExperts[laneIdx]; + } + else if (laneIdx >= params.mTopK && laneIdx < KernelParams::MaxNumTopExperts) + { + smemInterTopScores[warpIdx * KernelParams::MaxNumTopExperts + laneIdx] = invalidScoreFloat; + smemInterTopExperts[warpIdx * KernelParams::MaxNumTopExperts + laneIdx] + = MaxSupportedExpertCount - 1; } } __syncthreads(); if (warpIdx == 0) { - int constexpr 
NumInterTopKPerThread = (NumInterTopK * NumExpertWarps - 1) / WarpSize + 1; + int constexpr NumInterTopKPerThread = (NumInterTopK - 1) / WarpSize + 1; float intermidiateScore[NumInterTopKPerThread]; int32_t intermidiateExpert[NumInterTopKPerThread]; for (int i = laneIdx; i < NumInterTopKPerThread * WarpSize; i += WarpSize) @@ -295,7 +303,7 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Ke cudaGridDependencySynchronize(); } routingPermutation(params, nullptr, warpIdx, clusterBlockRank); + KernelParams::MaxNumTopExperts, /*LoadExpertIdxFromGlobal=*/true>(params, nullptr, warpIdx, clusterBlockRank); } #else __global__ void routingIndicesClusterKernel(KernelParams params) @@ -558,6 +566,10 @@ int constexpr getMaxNumExperts(int32_t numExperts) { return NumKimiK2Experts; } + else if (numExperts <= NumNemotronExperts) + { + return NumNemotronExperts; + } else { TLLM_LOG_ERROR("Unsupported numExperts"); @@ -571,17 +583,30 @@ int constexpr getMaxNumExperts(int32_t numExperts) if (data.mNumExperts <= topk::MaxNumExpertsUnit) \ { \ LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, smemSize, \ - stream, extraFlag1, forceFloatInput, topk::MaxNumExpertsUnit); \ + stream, extraFlag1, forceFloatInput, topk::MaxNumExpertsUnit, DefaultMaxNumTopExperts); \ } \ else if (data.mNumExperts <= NumDeepseekExperts) \ { \ LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, smemSize, \ - stream, extraFlag1, forceFloatInput, NumDeepseekExperts); \ + stream, extraFlag1, forceFloatInput, NumDeepseekExperts, DefaultMaxNumTopExperts); \ } \ else if (data.mNumExperts <= NumKimiK2Experts) \ { \ LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, smemSize, \ - stream, extraFlag1, forceFloatInput, NumKimiK2Experts); \ + stream, extraFlag1, forceFloatInput, NumKimiK2Experts, DefaultMaxNumTopExperts); \ + } \ + else if 
(data.mNumExperts <= NumNemotronExperts) \ + { \ + if (data.mTopK <= DefaultMaxNumTopExperts) \ + { \ + LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, \ + smemSize, stream, extraFlag1, forceFloatInput, NumNemotronExperts, DefaultMaxNumTopExperts); \ + } \ + else if (data.mTopK <= MaxSupportedTopExperts) \ + { \ + LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, numThreads, \ + smemSize, stream, extraFlag1, forceFloatInput, NumNemotronExperts, MaxSupportedTopExperts); \ + } \ } \ else \ { \ @@ -603,25 +628,6 @@ void run(Data& data, void* stream) (data.mPtrTopKPacked != nullptr || data.mPtrTopKIds != nullptr) && data.mPtrPermutedIdxSize, "If permuted index is required, `mPtrTopKPacked` or `mPtrTopKIds` is also required"); TLLM_CHECK_WITH_INFO(!data.mUseRoutingSoftmax, "Routing with softmax not implemented yet"); - TLLM_CHECK_WITH_INFO(data.mNumLimitedGroups <= MaxNumTopGroups, "Routing kernel expects <= %d top groups, got %d", - MaxNumTopGroups, data.mNumLimitedGroups); - TLLM_CHECK_WITH_INFO(data.mTopK <= MaxNumTopExperts, "Routing kernel expects topK experts <= %d, got %d", - MaxNumTopExperts, data.mTopK); - TLLM_CHECK_WITH_INFO(data.mTopK <= WarpSize, "Routing kernel expects top K <= warp size, got %d", data.mTopK); - TLLM_CHECK_WITH_INFO(data.mTopK * data.mNumLimitedGroups <= WarpSize, - "Routing kernel expects top K * top groups <= warp size (for now), got %d * %d", data.mTopK, - data.mNumLimitedGroups); - TLLM_CHECK_WITH_INFO(data.mNumExperts >= MaxNumTopExperts, "Routing kernel expects %d to be at most #experts %d", - MaxNumTopExperts, data.mNumExperts); - TLLM_CHECK_WITH_INFO(data.mNumExperts <= NumKimiK2Experts, "Routing kernel expects #experts %d <= #threads %d", - data.mNumExperts, NumKimiK2Experts); - TLLM_CHECK_WITH_INFO(data.mNumExpertGroups >= data.mNumLimitedGroups, - "Routing kernel expects top groups %d to be limited by #expert groups %d", 
data.mNumLimitedGroups, - data.mNumExpertGroups); - // Note: Routing-specific constraints (experts per group, topK limits) are checked later - // only when routing is actually needed (data.mPtrTopKIds == nullptr) - TLLM_CHECK_WITH_INFO( - data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", data.mNumExperts); int const numBlocks = data.mNumTokens; int const numThreadsHist = getMaxNumExperts(data.mNumExperts); @@ -655,9 +661,18 @@ void run(Data& data, void* stream) int const maxTokensCoop = (numBlocksCoop * numThreadsHist * 64) / data.mTopK; if (data.mPtrTopKIds == nullptr) { + TLLM_CHECK_WITH_INFO(data.mNumExperts >= MaxSupportedTopExperts, + "Routing kernel expects %d to be at most #experts %d", MaxSupportedTopExperts, data.mNumExperts); + TLLM_CHECK_WITH_INFO(data.mNumExperts <= MaxSupportedExpertCount, + "Routing kernel expects #experts %d <= #threads %d", data.mNumExperts, MaxSupportedExpertCount); + TLLM_CHECK_WITH_INFO(data.mTopK <= MaxSupportedTopExperts, "Routing kernel expects topK experts <= %d, got %d", + MaxSupportedTopExperts, data.mTopK); + // Routing needs to be executed - validate routing kernel constraints if (data.mNumExpertGroups > 1) { + // Note: Routing-specific constraints (experts per group, topK limits) are checked when routing is actually + // needed (data.mPtrTopKIds == nullptr) TLLM_CHECK_WITH_INFO(data.mNumExpertGroups <= MaxNumGroups, "Routing kernel expects #expert groups %d to be <= max groups %d", data.mNumExpertGroups, MaxNumGroups); TLLM_CHECK_WITH_INFO(data.mNumExperts % data.mNumExpertGroups == 0, @@ -667,14 +682,17 @@ void run(Data& data, void* stream) "Routing kernel expects #experts per group <= warp size (%d), got %d experts / %d groups = %d experts " "per group", WarpSize, data.mNumExperts, data.mNumExpertGroups, data.mNumExperts / data.mNumExpertGroups); - } - else - { - TLLM_CHECK_WITH_INFO(data.mTopK <= topk::MaxNumTopK, "Routing kernel expects top K %d to be <= max topk %d", - 
data.mTopK, topk::MaxNumTopK); + TLLM_CHECK_WITH_INFO(data.mNumLimitedGroups <= MaxNumTopGroups, + "Routing kernel expects <= %d top groups, got %d", MaxNumTopGroups, data.mNumLimitedGroups); + + TLLM_CHECK_WITH_INFO(data.mNumExpertGroups >= data.mNumLimitedGroups, + "Routing kernel expects top groups %d to be limited by #expert groups %d", data.mNumLimitedGroups, + data.mNumExpertGroups); + TLLM_CHECK_WITH_INFO(data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", + data.mNumExperts); } - int const numThreadsMain = data.mNumExperts < NumDeepseekExperts ? NumDeepseekExperts : NumKimiK2Experts; + int const numThreadsMain = max(data.mNumExpertGroups * WarpSize, getMaxNumExperts(data.mNumExperts)); LAUNCH_ROUTING_DEEPSEEK(data, /*coopLaunch=*/false, routingMainKernel, numBlocks, numThreadsMain, /*smemSize=*/0, // No dynamic smem diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.h index d5aed6dbc9..888e04f254 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.h @@ -189,13 +189,15 @@ struct Data : public DataBase bool mUseRoutingSoftmax; }; -template +template struct KernelParams : public KernelParamsBase { using InputT = InputT_; using OutputT = OutputT_; static constexpr bool UseGroups = UseGroups_; + static constexpr int MaxNumTopExperts = MaxNumTopExperts_; PackedScoreIdx* mPtrTopKPacked = nullptr; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernelTopK.cuh b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernelTopK.cuh index 2797baa6a9..7eab1c82a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernelTopK.cuh +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernelTopK.cuh @@ -35,7 +35,7 @@ namespace cg = cooperative_groups; 
static constexpr int WarpSize = 32; static constexpr int MaxNumExpertsUnit = 128; -static constexpr int MaxNumTopK = 10; +static constexpr int MaxSupportedTopExperts = 22; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu index 7a9cc1f732..67b6913aaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu @@ -25,7 +25,7 @@ static constexpr int NumExpertsLimit = 512; static constexpr int NumThreads = 1024; static constexpr int NumWarps = NumThreads / WarpSize; -static constexpr int MaxNumTopExperts = 10; +static constexpr int MaxSupportedTopExperts = 10; static constexpr int MaxNumTokensSingleCluster = NumBlocksPerCluster * NumThreads; static constexpr int MaxNumTokensSingleClusterScores = NumBlocksPerCluster * NumWarps; @@ -34,8 +34,8 @@ static constexpr int BlockKernelMaxNumTokens = 4; template __forceinline__ __device__ void routingTopKExperts(cg::thread_block_tile const& warp, - DataType (&score)[VecSize], int32_t (&idx)[VecSize], DataType (&warpTopKScore)[MaxNumTopExperts], - int32_t (&warpTopKExpertIdx)[MaxNumTopExperts], int32_t const laneIdx, int32_t const numExperts, int32_t topK, + DataType (&score)[VecSize], int32_t (&idx)[VecSize], DataType (&warpTopKScore)[MaxSupportedTopExperts], + int32_t (&warpTopKExpertIdx)[MaxSupportedTopExperts], int32_t const laneIdx, int32_t const numExperts, int32_t topK, InputType const* ptrScores, bool const normTopkProb, bool const applySoftmaxAfterTopK = true) { DataType minScore = DataType{-INFINITY}; @@ -149,8 +149,8 @@ __global__ void __launch_bounds__(KernelParams::MaxNumExperts) routingIndicesBlo BaseType score[VecSize]; int32_t idx[VecSize]; - BaseType 
warpTopKScore[MaxNumTopExperts]; - int32_t warpTopKExpertIdx[MaxNumTopExperts]; + BaseType warpTopKScore[MaxSupportedTopExperts]; + int32_t warpTopKExpertIdx[MaxSupportedTopExperts]; BaseType minScore = BaseType{-INFINITY}; if (validToken) @@ -306,7 +306,7 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu static constexpr int VecSize = KernelParams::MaxNumExperts / WarpSize; - __shared__ TypePacked __attribute((aligned(128))) smemPackedScoreIdx[NumWarps * MaxNumTopExperts]; + __shared__ TypePacked __attribute((aligned(128))) smemPackedScoreIdx[NumWarps * MaxSupportedTopExperts]; uint32_t const clusterBlockRank = blockIdx.x; @@ -332,8 +332,8 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu BaseType score[VecSize]; int32_t idx[VecSize]; - BaseType warpTopKScore[MaxNumTopExperts]; - int32_t warpTopKExpertIdx[MaxNumTopExperts]; + BaseType warpTopKScore[MaxSupportedTopExperts]; + int32_t warpTopKExpertIdx[MaxSupportedTopExperts]; BaseType minScore = BaseType{-INFINITY}; if (validToken) @@ -356,12 +356,12 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu if (params.mPtrScores != nullptr) { - routingPermutation(params, smemPackedScoreIdx, warpIdx, clusterBlockRank); } else { - routingPermutation(params, smemPackedScoreIdx, warpIdx, clusterBlockRank); } } @@ -417,8 +417,8 @@ __global__ void __launch_bounds__(KernelParams::MaxNumExperts) routingIndicesHis // over all warps/tokens BaseType allScores[VecSize]; int32_t allExpertIdx[VecSize]; - BaseType warpTopKScore[MaxNumTopExperts]; - int32_t warpTopKExpertIdx[MaxNumTopExperts]; + BaseType warpTopKScore[MaxSupportedTopExperts]; + int32_t warpTopKExpertIdx[MaxSupportedTopExperts]; for (int tokenIdx = globalWarpIdx; tokenIdx < params.mNumTokens; tokenIdx += globalWarpStride) { auto scoreOffset = tokenIdx * params.mNumExperts; @@ -486,8 +486,8 @@ void run(Data const& data, void* stream) 
TLLM_CHECK_WITH_INFO(data.mPtrPermutedIdxSize != nullptr && data.mPtrCtaIdxXyToBatchIdx != nullptr && data.mPtrCtaIdxXyToMnLimit != nullptr && data.mPtrNumNonExitingCtas != nullptr, "Llama4 routing kernel expects permuted idx and grouped Gemm launch config buffers"); - TLLM_CHECK_WITH_INFO(data.mTopK <= MaxNumTopExperts, "Routing kernel expects topK experts <= %d, got %d", - MaxNumTopExperts, data.mTopK); + TLLM_CHECK_WITH_INFO(data.mTopK <= MaxSupportedTopExperts, "Routing kernel expects topK experts <= %d, got %d", + MaxSupportedTopExperts, data.mTopK); TLLM_CHECK_WITH_INFO(data.mNumExperts <= NumExpertsLimit, "Routing kernel expects #experts %d to be no more than %d", data.mNumExperts, NumExpertsLimit); // static_assert(MaxNumExperts <= NumThreads, "#experts must be bounded by #threads"); diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index d348d95cb6..81e420ec57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -70,7 +70,7 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3 { if (routingMethodType == RoutingMethodType::DeepSeekV3) { - TLLM_CHECK_WITH_INFO(topK <= 8, "For DeepSeek routing method, must have topK <= 8"); + TLLM_CHECK_WITH_INFO(topK <= 22, "For DeepSeek routing method, must have topK <= 22"); TLLM_CHECK_WITH_INFO(topkGroup <= 4, "For DeepSeek routing method, must have topkGroup <= 4"); moe::dev::routing::routingDeepSeek::Data routingData; routingData.mDtypeExpW = btg::Dtype::Bfloat16; diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp index 81746654a4..c9d9085614 100644 --- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp @@ -106,7 +106,7 @@ std::vector run_fp4_block_scale_moe_runner(torch::optional 1) { 
TORCH_CHECK(static_cast(routing_method_type) == RoutingMethodType::DeepSeekV3, "Routing kernel with groups implies DeepSeekV3 routing method."); diff --git a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp index b8e688d1d3..2db4e2bf6c 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp @@ -104,7 +104,7 @@ at::Tensor run_fp8_block_scale_moe(at::optional const& routing_logit TORCH_CHECK(routing_bias.value().sizes()[0] == num_experts, "routing_bias has incorrect shape."); } - if (n_group.has_value() && n_group.value() != 0) + if (n_group.has_value() && n_group.value() > 1) { TORCH_CHECK(static_cast(routing_method_type) == RoutingMethodType::DeepSeekV3, "Routing kernel with groups implies DeepSeekV3 routing method."); diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp index 9681be6e7a..efefc06632 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp @@ -107,7 +107,7 @@ torch::Tensor fp8_per_tensor_scale_moe_runner(torch::optional con TORCH_CHECK(routing_bias.value().sizes()[0] == num_experts, "routing_bias has incorrect shape."); } - if (n_group.has_value() && n_group.value() != 0) + if (n_group.has_value() && n_group.value() > 1) { TORCH_CHECK(static_cast(routing_method_type) == RoutingMethodType::DeepSeekV3, "Routing kernel with groups implies DeepSeekV3 routing method."); diff --git a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp index 087871593e..08bce0611b 100644 --- a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp @@ -114,7 +114,7 @@ torch::Tensor dtype_mxe2m1_block_scale_moe_runner(torch::optional TORCH_CHECK(routing_bias.value().sizes()[0] == num_experts, "routing_bias has incorrect shape."); } - if (n_group.has_value() && n_group.value() != 0) + if 
(n_group.has_value() && n_group.value() > 1) { TORCH_CHECK(static_cast(routing_method_type) == RoutingMethodType::DeepSeekV3, "Routing kernel with groups implies DeepSeekV3 routing method."); diff --git a/cpp/tests/unit_tests/kernels/routing/routingDeepSeekTest.cpp b/cpp/tests/unit_tests/kernels/routing/routingDeepSeekTest.cpp index 3d82670472..0467c17496 100644 --- a/cpp/tests/unit_tests/kernels/routing/routingDeepSeekTest.cpp +++ b/cpp/tests/unit_tests/kernels/routing/routingDeepSeekTest.cpp @@ -244,6 +244,17 @@ TYPED_TEST(RoutingDeepSeekKernelTest, ClusterLevelParallelization384) this->runTest(param); }; +TYPED_TEST(RoutingDeepSeekKernelTest, ClusterLevelParallelization512) +{ + RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/4, // 1024 + /*numExperts=*/512, /*topK=*/22, + /*expertParallelization=*/1, /*expertParallelizationId=*/0, /*tileTokensDim=*/256, + /*paddingLog2=*/3, /*localExpertsStrideLog2=*/0, + /*usePdl=*/true, /*getExpWeights=*/true, /*useTopKAsInput=*/false, /*hasInvalidTopKInput=*/false, + /*nGroup*/ 1, /*topkGroup*/ 1, /*routedScalingFactor*/ 1.0f, /*requiredComputeCapability*/ 9); + this->runTest(param); +}; + TYPED_TEST(RoutingDeepSeekKernelTest, ClusterLevelParallelization) { RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/1024, // 10 @@ -310,6 +321,17 @@ TYPED_TEST(RoutingDeepSeekKernelTest, CooperativeLevelParallelization384) this->runTest(param); }; +TYPED_TEST(RoutingDeepSeekKernelTest, CooperativeLevelParallelization512) +{ + RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/1030, + /*numExperts=*/512, /*topK=*/22, + /*expertParallelization=*/1, /*expertParallelizationId=*/0, /*tileTokensDim=*/256, + /*paddingLog2=*/3, /*localExpertsStrideLog2=*/0, + /*usePdl=*/true, /*getExpWeights=*/true, /*useTopKAsInput=*/false, /*hasInvalidTopKInput=*/false, + /*nGroup*/ 1, /*topkGroup*/ 1, /*routedScalingFactor*/ 1.0f, /*requiredComputeCapability*/ 10); + 
this->runTest(param); +}; + TYPED_TEST(RoutingDeepSeekKernelTest, DeviceLevelParallelization) { RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/20300, @@ -332,6 +354,17 @@ TYPED_TEST(RoutingDeepSeekKernelTest, DeviceLevelParallelization384) this->runTest(param); }; +TYPED_TEST(RoutingDeepSeekKernelTest, DeviceLevelParallelization512) +{ + RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/20300, + /*numExperts=*/512, /*topK=*/22, + /*expertParallelization=*/1, /*expertParallelizationId=*/0, /*tileTokensDim=*/256, + /*paddingLog2=*/3, /*localExpertsStrideLog2=*/0, + /*usePdl=*/true, /*getExpWeights=*/true, /*useTopKAsInput=*/false, /*hasInvalidTopKInput=*/false, + /*nGroup*/ 1, /*topkGroup*/ 1, /*routedScalingFactor*/ 1.0f, /*requiredComputeCapability*/ 10); + this->runTest(param); +}; + TYPED_TEST(RoutingDeepSeekKernelTest, ClusterLevelParallelizationTop2) { RoutingKernelTestParam param(RoutingMethodType::DeepSeekV3, /*numTokens=*/10, diff --git a/tensorrt_llm/_torch/modules/fused_moe/routing.py b/tensorrt_llm/_torch/modules/fused_moe/routing.py index d879c6b003..85e2b2c98d 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/routing.py +++ b/tensorrt_llm/_torch/modules/fused_moe/routing.py @@ -263,7 +263,8 @@ class Deepseekv3RoutingImpl: ) self.is_fused = False else: - if num_experts > 384 or self.top_k > 8: + # We have special implementation for n_group == 1, top_k == 22 and num_experts == 512 for Nemotron Super v3. + if num_experts > 512 or (self.top_k > 8 and self.top_k != 22): if (self.is_fused): warnings.warn( "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation." 
@@ -292,7 +293,11 @@ class Deepseekv3RoutingImpl: score_mask = group_mask.unsqueeze(-1).expand( scores_shape[:-1] + [n_group, scores_shape[-1] // n_group]).reshape(scores_shape) - scores_with_bias = scores_with_bias * score_mask + scores_with_bias = torch.where( + score_mask.bool(), scores_with_bias, + torch.tensor(float('-inf'), + dtype=scores_with_bias.dtype, + device=scores_with_bias.device)) _, topk_idx = torch.topk(scores_with_bias, k=self.top_k, dim=-1, diff --git a/tests/unittest/_torch/thop/parallel/test_noaux_tc.py b/tests/unittest/_torch/thop/parallel/test_noaux_tc.py index d1c44c0ac8..0e1437034f 100644 --- a/tests/unittest/_torch/thop/parallel/test_noaux_tc.py +++ b/tests/unittest/_torch/thop/parallel/test_noaux_tc.py @@ -9,6 +9,7 @@ from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3Gate (256, 8, 4, 8), (72, 1, 1, 6), (384, 1, 1, 8), + (512, 1, 1, 22), ]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index 99ae844fc6..e252fc6047 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -1008,6 +1008,17 @@ class TestMoeFp4: "routing_method_type": RoutingMethodType.DeepSeekV3 }, id="RoutingDSv3"), + pytest.param( + { + "num_experts": 512, + "top_k": 22, + "n_groups": 1, + "top_k_groups": 1, + "routed_scaling": 2.5, + "has_routing_bias": True, + "routing_method_type": RoutingMethodType.DeepSeekV3 + }, + id="RoutingDS_SuperV3"), pytest.param( { "num_experts": 72, @@ -1238,7 +1249,7 @@ class TestMoeFp4: pytest.skip("https://nvbugs/5434352") assert top_k <= num_experts - assert top_k <= 10 + assert top_k <= 22 assert num_experts % 4 == 0 if use_topk_as_input: From 8ba8699f66b6d553598524de3be97625ec7814d5 Mon Sep 17 00:00:00 2001 From: Yechan Kim <161688079+yechank-nvidia@users.noreply.github.com> Date: Tue, 16 Dec 2025 13:05:20 +0900 Subject: 
[PATCH 162/172] [TRTLLM-8310][feat] Add Qwen3-VL-MoE (#9689) Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- requirements.txt | 3 +- .../_torch/attention_backend/interface.py | 3 +- tensorrt_llm/_torch/models/__init__.py | 2 + .../hf/qwen3vl_moe_weight_mapper.py | 24 + .../_torch/models/modeling_mistral.py | 12 +- .../models/modeling_multimodal_utils.py | 81 +- .../_torch/models/modeling_qwen2vl.py | 162 ++- tensorrt_llm/_torch/models/modeling_qwen3.py | 7 +- .../_torch/models/modeling_qwen3_moe.py | 26 +- .../_torch/models/modeling_qwen3_next.py | 4 - .../_torch/models/modeling_qwen3vl.py | 992 ++++++++++++++++++ .../_torch/models/modeling_qwen3vl_moe.py | 64 ++ .../_torch/models/modeling_speculative.py | 2 + tensorrt_llm/_torch/models/modeling_utils.py | 3 + tensorrt_llm/_torch/modules/attention.py | 2 +- .../_torch/modules/qk_norm_attention.py | 4 +- .../_torch/modules/rotary_embedding.py | 37 +- .../_torch/pyexecutor/cuda_graph_runner.py | 2 +- tensorrt_llm/tools/multimodal_builder.py | 2 +- .../defs/accuracy/references/mmmu.yaml | 2 + .../test_llm_api_pytorch_multimodal.py | 18 + .../test_lists/qa/llm_function_core.txt | 1 + .../test_lists/test-db/l0_l40s.yml | 1 + .../singlegpu/models/test_llama4_vlm_patch.py | 5 + .../singlegpu/test_ad_build_small_single.py | 13 + .../test_modeling_llama_min_latency.py | 5 +- .../modeling/test_modeling_multimodal.py | 8 +- .../modeling/test_modeling_qwen2_5vl.py | 16 +- .../modeling/test_modeling_qwen3vl_moe.py | 283 +++++ .../_torch/modeling/test_modeling_siglip.py | 3 +- triton_backend/requirements.txt | 3 +- 31 files changed, 1630 insertions(+), 160 deletions(-) create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_moe_weight_mapper.py create mode 100644 tensorrt_llm/_torch/models/modeling_qwen3vl.py create mode 100644 tensorrt_llm/_torch/models/modeling_qwen3vl_moe.py create mode 100644 tests/unittest/_torch/modeling/test_modeling_qwen3vl_moe.py diff --git a/requirements.txt 
b/requirements.txt index b57ffa056f..8ca6851bc7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,7 @@ nvidia-modelopt[torch]~=0.37.0 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7 nvidia-nccl-cu13==2.27.7 nvidia-cuda-nvrtc -transformers==4.56.0 +transformers==4.57.1 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 @@ -76,3 +76,4 @@ partial_json_parser apache-tvm-ffi==0.1.4 # used for reduce nvidia-cutlass-dsl host overhead torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf mistral-common==1.8.6 +torchao>=0.14.1 diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 43244fc1bc..2326a264ed 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -578,8 +578,9 @@ class PositionalEmbeddingParams: rope: Optional[RopeParams] = None is_neox: bool = True - # mRoPE params (currently, Qwen2/2.5-VL uses it) + # mRoPE params mrope_section: Optional[List[int]] = None + mrope_interleaved: bool = False def __post_init__(self) -> None: if self.type.is_deferred(): diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py index 6c6c6a4f1d..59386dc20f 100644 --- a/tensorrt_llm/_torch/models/__init__.py +++ b/tensorrt_llm/_torch/models/__init__.py @@ -28,6 +28,7 @@ from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel from .modeling_qwen3 import Qwen3ForCausalLM from .modeling_qwen3_moe import Qwen3MoeForCausalLM from .modeling_qwen3_next import Qwen3NextForCausalLM +from .modeling_qwen3vl_moe import Qwen3MoeVLModel from .modeling_qwen_moe import Qwen2MoeForCausalLM from .modeling_seedoss import SeedOssForCausalLM from .modeling_siglip import SiglipVisionModel @@ -71,6 +72,7 @@ __all__ = [ "Qwen3ForCausalLM", "Qwen3MoeForCausalLM", 
"Qwen3NextForCausalLM", + "Qwen3MoeVLModel", "GptOssForCausalLM", "SeedOssForCausalLM", "Glm4MoeForCausalLM", diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_moe_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_moe_weight_mapper.py new file mode 100644 index 0000000000..cb72762c5d --- /dev/null +++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_moe_weight_mapper.py @@ -0,0 +1,24 @@ +from torch import nn + +from tensorrt_llm._torch.models.checkpoints.hf.qwen3_moe_weight_mapper import Qwen3MoeHfWeightMapper +from tensorrt_llm._torch.models.modeling_utils import register_mapper +from tensorrt_llm._torch.modules.fused_moe.interface import MoE + + +@register_mapper("HF", "Qwen3VLMoeForConditionalGeneration") +class Qwen3VLMoeHfWeightMapper(Qwen3MoeHfWeightMapper): + def handle_special_instance_module( + self, + module: nn.Module, + module_name: str, + module_weights: dict, + allow_partial_loading: bool = False, + ) -> None: + if isinstance(module, MoE): + updated_module_weights = {} + for weight_name, weight_value in module_weights.items(): + new_weight_name = weight_name.replace("scale_inv", "weight_scale") + updated_module_weights[new_weight_name] = weight_value + module.load_weights( + weights=[updated_module_weights], allow_partial_loading=allow_partial_loading + ) diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py index 2667d20d55..1fe669365b 100644 --- a/tensorrt_llm/_torch/models/modeling_mistral.py +++ b/tensorrt_llm/_torch/models/modeling_mistral.py @@ -1,6 +1,5 @@ import copy import dataclasses -import os from typing import Any, Dict, List, Tuple import torch @@ -20,7 +19,8 @@ from tensorrt_llm._torch.models.checkpoints.mistral.weight_mapper import \ from tensorrt_llm._torch.models.modeling_mistral_large3 import ( Mistral3Gate, MistralLarge3ForCausalLM) from tensorrt_llm._torch.models.modeling_multimodal_utils import ( - find_input_mm_embeds, fuse_input_embeds, 
get_multimodal_embeddings) + _MULTIMODAL_ENV_NAME, _is_disagg, find_input_mm_embeds, fuse_input_embeds, + get_multimodal_embeddings) from tensorrt_llm._torch.models.modeling_utils import (DecoderModel, DecoderModelForCausalLM, _load_weights_impl, @@ -45,13 +45,6 @@ from tensorrt_llm.inputs.multimodal import MultimodalParams from tensorrt_llm.llmapi import SamplingParams from tensorrt_llm.logger import logger -_MULTIMODAL_ENV_NAME = "TLLM_MULTIMODAL_DISAGGREGATED" - - -# Make this a runtime lookup rather than a module-wide constant for easier unit testing. -def _is_disagg() -> bool: - return os.getenv(_MULTIMODAL_ENV_NAME, "0") == "1" - class MistralAttention(Attention): @@ -373,6 +366,7 @@ class Mistral3VLM(PreTrainedModel): ) config = model_config.pretrained_config + self._supports_sdpa = True super().__init__(config) vision_feature_layer = getattr(config, "vision_feature_layer", -1) diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py index d76397a9fb..1901fca549 100644 --- a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py +++ b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py @@ -17,7 +17,8 @@ # and s2wrapper: https://github.com/bfshi/scaling_on_scales import math -from typing import Any, Dict, List, Optional, Tuple, cast +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn.functional as F @@ -29,6 +30,13 @@ from tensorrt_llm._torch.modules.embedding import Embedding from tensorrt_llm.inputs.multimodal import MultimodalParams from tensorrt_llm.logger import logger +_MULTIMODAL_ENV_NAME = "TLLM_MULTIMODAL_DISAGGREGATED" + + +# Make this a runtime lookup rather than a module-wide constant for easier unit testing. 
+def _is_disagg() -> bool: + return os.getenv(_MULTIMODAL_ENV_NAME, "0") == "1" + def _get_uncached_multimodal_params( multimodal_params: List[MultimodalParams], ) -> List[MultimodalParams]: @@ -67,17 +75,17 @@ def _cache_multimodal_embeddings( mostly for chunked prefill. It does not persist embeddings across different requests or sessions. """ # TODO: support multiple multimodal modalities per request - assert len( - embeddings - ) == 1, "Currently only support single mm_embeds (single modality) per request" + if len(embeddings) > 1: + raise ValueError("Multiple modalities caching is not supported yet.") mm_embed = embeddings[0] # Collect embedding lengths for each parameter - embed_lengths = [ - param.multimodal_runtime.total_mm_tokens_in_request - - param.multimodal_runtime.total_special_tokens_in_request - for param in multimodal_params if param.multimodal_runtime is not None - ] + embed_lengths = [] + for param in multimodal_params: + if param.multimodal_runtime is not None: + embed_lengths.append( + param.multimodal_runtime.total_mm_tokens_in_request - + param.multimodal_runtime.total_special_tokens_in_request) # Validate total length matches total_expected = sum(embed_lengths) @@ -103,7 +111,10 @@ def _cache_multimodal_embeddings( def get_multimodal_embeddings( - encoder_forward_fn, + encoder_forward_fn: Callable[ + [List[MultimodalParams]], + Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, Any]]], + ], multimodal_params: List[MultimodalParams], encoder_kwargs: Optional[Dict[str, Any]] = None, ) -> List[torch.Tensor]: @@ -117,12 +128,13 @@ def get_multimodal_embeddings( 4. Gather all embeddings for the batch Args: - encoder_forward_fn: Callable that performs encoder forward pass - Should accept List[MultimodalParams] and return List[torch.Tensor] - multimodal_params: All multimodal parameters in the batch - + encoder_forward_fn: Callable that performs encoder forward pass. 
+ Should accept List[MultimodalParams] and return List[torch.Tensor] or + Tuple[List[torch.Tensor], Dict[str, Any]] for models with auxiliary outputs. + multimodal_params: All multimodal parameters in the batch. + encoder_kwargs: Optional kwargs to pass to encoder_forward_fn. Returns: - List of multimodal embeddings for all multimodal params in the batch + List of multimodal embeddings for all multimodal params in the batch. """ if not multimodal_params: return [] @@ -134,12 +146,13 @@ def get_multimodal_embeddings( # Step 2: Run encoder forward only on uncached parameters if uncached_multimodal_params: kwargs = encoder_kwargs or {} - encoder_outputs = encoder_forward_fn(uncached_multimodal_params, - **kwargs) + encoder_embeddings = encoder_forward_fn(uncached_multimodal_params, + **kwargs) # TODO: support multiple multimodal modalities per request - if len(encoder_outputs) > 1: - return encoder_outputs + if len(encoder_embeddings) > 1: + logger.warning("Multiple modalities caching is not supported yet.") + return encoder_embeddings # Validate that multimodal_runtime has required attributes for caching if (not hasattr(uncached_multimodal_params[0], 'multimodal_runtime') @@ -147,13 +160,13 @@ def get_multimodal_embeddings( or uncached_multimodal_params[0].multimodal_runtime. total_mm_tokens_in_request is None): logger.warning( - "Multimodal runtime data missing or incomplete - recomputed all embeddings" + "Multimodal runtime data missing or incomplete, will not cache embeddings." 
) - return encoder_outputs + return encoder_embeddings # Step 3: Cache the computed embeddings to multimodal_data["multimodal_embedding"] _cache_multimodal_embeddings(uncached_multimodal_params, - encoder_outputs) + encoder_embeddings) # Step 4: Gather all embeddings for the batch for param in multimodal_params: @@ -301,8 +314,12 @@ def fuse_input_embeds( mm_token_ids: Optional[torch.IntTensor] = None, text_token_indices: Optional[torch.IntTensor] = None, mm_token_indices: Optional[torch.IntTensor] = None, + extra_embeds: Optional[List[torch.Tensor]] = None, **kwargs, -) -> Tuple[Optional[torch.IntTensor], Optional[torch.FloatTensor]]: + # TODO: make unified return type for all models +) -> Union[Tuple[Optional[torch.IntTensor], Optional[torch.FloatTensor]], + Tuple[Optional[torch.IntTensor], Optional[torch.FloatTensor], + Optional[List[torch.FloatTensor]]]]: """ Fuse text and multimodal embeddings. input_ids is [text_total_length + mm_total_length] and mm_embed is [mm_total_length, hidden_dim]. We just need to fuse them into [text_total_length + mm_total_length, hidden_dim] by slice-and-assign to the corresponding entries. @@ -311,6 +328,7 @@ def fuse_input_embeds( input_ids: shape [text_total_length + mm_total_length], flattened from List[(text_length1 + mm_total_length1), ..., (text_lengthi + mm_total_lengthi)]. For LLM model, the requests are inflight batched together, but the input_ids are flattened with padding removed. By the slice condition < vocab_size, we can easily separate text / multimodal tokens and naturally batched the LLM embedding lookup mm_embeds: List[(mm_total_length1, hidden_dim), ..., (mm_total_lengthi, hidden_dim)]. mm_token_ids: possible token ids for multimodal tokens, if known. If not known and set to None, it is assumed that the multimodal tokens are out-of-vocabulary tokens. + extra_embeds: Optional list of extra embed tensors for models that support it (e.g., Qwen3-VL/Qwen3-MoE-VL). 
Returns: - If (1) JIT test run, (2) non-multimodal run, i.e. all text-only requests, either context or generation phase (3) multimodal run, all requests in generation phase --> there is no multimodal data, return only the input_ids - If (4) multimodal run, mixed batch of context and generation requests, each context request has a multimodal feature --> return only the fused input_embeds of shape [total length, hidden_dim]. For text tokens, LLM embedding layer has already run. @@ -319,6 +337,8 @@ def fuse_input_embeds( - This function may involve host-device synchronization if indices are not provided and filtering is performed. See filter_mm_token_from_input_ids for details. """ if len(mm_embeds) == 0: + if extra_embeds is not None and len(extra_embeds) > 0: + return input_ids, None, extra_embeds return input_ids, None mm_embed = torch.cat(mm_embeds, dim=0) @@ -330,7 +350,6 @@ def fuse_input_embeds( input_ids, vocab_size=embedding_layer.num_embeddings, mm_token_ids=mm_token_ids) - if mm_token_indices.shape[0] != mm_embed.shape[0]: raise ValueError( f"Multimodal token count mismatch: found {len(mm_token_indices)} image tokens in input_ids " @@ -343,11 +362,23 @@ def fuse_input_embeds( mm_embed.shape[-1], device=text_embed.device, dtype=text_embed.dtype) + if extra_embeds is not None and len(extra_embeds) > 0: + # only support single modality for deepstack features for now + for i, extra_feature in enumerate(extra_embeds): + extra_embed = torch.zeros( + input_ids.shape[0], + mm_embed.shape[-1], + device=extra_feature.device, + dtype=extra_feature.dtype, + ) + extra_embed[mm_token_indices, :] = extra_feature + extra_embeds[i] = extra_embed input_embeds[text_token_indices, :] = text_embed input_embeds[mm_token_indices, :] = mm_embed.to(dtype=input_embeds.dtype, device=input_embeds.device) - + if extra_embeds is not None and len(extra_embeds) > 0: + return None, cast(torch.FloatTensor, input_embeds), extra_embeds return None, cast(torch.FloatTensor, input_embeds) diff 
--git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py index 0e77f4aa30..6740188f3d 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py @@ -1,5 +1,4 @@ import copy -import os import re from typing import Any, Dict, List, Optional, Tuple, Union @@ -10,8 +9,8 @@ from transformers import (AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel) from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionPatchEmbed, Qwen2_5_VisionRotaryEmbedding, - Qwen2_5_VisionTransformerPretrainedModel, Qwen2_5_VLMLP, - Qwen2_5_VLVisionBlock, apply_rotary_pos_emb_vision) + Qwen2_5_VisionTransformerPretrainedModel, Qwen2_5_VLVisionBlock, + apply_rotary_pos_emb_vision) from transformers.models.qwen2_vl.modeling_qwen2_vl import \ Qwen2VisionTransformerPretrainedModel @@ -21,8 +20,9 @@ from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \ BaseWeightMapper from tensorrt_llm._torch.models.checkpoints.hf.qwen2vl_weight_mapper import \ Qwen2VLHfWeightMapper +from tensorrt_llm._torch.models.modeling_multimodal_utils import _is_disagg from tensorrt_llm._torch.modules.attention import Attention -from tensorrt_llm._torch.modules.linear import Linear +from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.inputs.multimodal import MultimodalParams @@ -38,6 +38,7 @@ from ...sampling_params import SamplingParams from ..attention_backend import AttentionMetadata from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams from ..attention_backend.utils import get_attention_backend +from ..modules.gated_mlp import GatedMLP from ..modules.rotary_embedding import MRotaryEmbedding from .modeling_auto import AutoModelForCausalLM from .modeling_multimodal_utils import 
(find_input_mm_embeds, fuse_input_embeds, @@ -46,48 +47,9 @@ from .modeling_utils import (ModelConfig, QuantConfig, _load_weights_impl, filter_weights, register_auto_model, register_vision_encoder) -DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1' PAD_INDEX = -100 # NOTE: refer to https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L269 -def process_weights(weights: Dict, - prefix: str = "visual", - weight_name_mapping: Dict[str, str] = None) -> Dict: - """ - Filter and transform weights in a single modular function. - - Args: - weights: Dictionary of all model weights - prefix: Prefix to filter weights by (default: "visual") - weight_name_mapping: Optional mapping to transform weight names - - Returns: - Dictionary of processed weights ready for loading - """ - - # Filter weights by prefix (handles both direct and "model." prefixed keys) - filtered_weights = {} - for key, weight in weights.items(): - if key.startswith(prefix): - filtered_weights[key] = weight - elif key.startswith("model." 
+ prefix): - filtered_weights[key[len("model."):]] = weight - - # Transform weight names if mapping provided - if weight_name_mapping: - transformed_weights = {} - for key, weight in filtered_weights.items(): - new_key = key - for old_suffix, new_suffix in weight_name_mapping.items(): - if key.endswith(old_suffix): - new_key = key.replace(old_suffix, new_suffix) - break - transformed_weights[new_key] = weight - return transformed_weights - - return filtered_weights - - class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, BaseMultimodalDummyInputsBuilder): @@ -310,7 +272,7 @@ class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, mrope_position_deltas, device=input_ids.device).unsqueeze(1) return position_ids, mrope_position_deltas - def _preprocess(self, text: dict[str, any], mm_data: dict[str, any], + def _preprocess(self, text: Dict[str, any], mm_data: Dict[str, any], mm_processor_kwargs: Dict[str, Any]): images = mm_data.get("image") video_datas = mm_data.get("video") @@ -323,8 +285,6 @@ class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, do_rescale = False if videos and isinstance(videos[0][0], torch.Tensor): do_rescale = False - # transformers=4.53.1 does not support GPU video tensors in Qwen2VL processor. 
- videos = [[frame.to("cpu") for frame in video] for video in videos] return self.processor(text=[text], images=images, videos=videos, @@ -346,7 +306,7 @@ class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, image_grid_thw: torch.LongTensor, video_grid_thw: torch.LongTensor, attention_mask: torch.Tensor, - second_per_grid_ts: torch.Tensor = None) -> dict[str, torch.Tensor]: + second_per_grid_ts: torch.Tensor = None) -> Dict[str, torch.Tensor]: mrope_position_ids, mrope_position_deltas = Qwen2VLInputProcessorBase.get_rope_index( self.config, input_ids, image_grid_thw, video_grid_thw, attention_mask, second_per_grid_ts) @@ -437,6 +397,10 @@ class Qwen2VisionModelBase(nn.Module): def load_weights(self, weights: Dict): visual_weights = filter_weights("visual", weights) converted_weights = dict() + if isinstance(self.visual, (Qwen2VisionTransformerPretrainedModel, + Qwen2_5_VisionTransformerPretrainedModel)): + self.visual.load_state_dict(visual_weights, strict=True) + return qkv_pattern = re.compile(r'(.*?)attn\.qkv\.(.*)') for name in visual_weights: @@ -559,13 +523,13 @@ class Qwen2_5_VLVisionAttention(Attention): self, hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]], + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]], **kwargs, ) -> torch.Tensor: # NOTE: Need separate Attention forward() for Qwen2.5-VL for multiple reasons # 1. We don't have the route for handing over position_embeddings to the Attention forward() # 2. Could not override the apply_rope() as we don't have the position_ids in the Vision Attention's rotary embedding. - # (TODO: yechank-nvidia) Make OOTO path more modular and reusable for Attention's Rotary Embedding. + # (TODO: yechank-nvidia) Make OOTB path more modular and reusable for Attention's Rotary Embedding. 
qkv = self.qkv_proj(hidden_states) q, k, v = qkv, None, None @@ -593,10 +557,26 @@ class Qwen2_5_VLVisionAttention(Attention): return attn_output +class Qwen2_5_VLMLP(GatedMLP): + + def __init__(self, model_config: ModelConfig[PretrainedConfig], + layer_idx: int): + config = model_config.pretrained_config.vision_config + super().__init__( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + bias=True, + activation=F.silu, + dtype=model_config.pretrained_config.torch_dtype, + config=model_config, + layer_idx=layer_idx, + ) + + class Qwen2_5_VLVisionBlock(torch.nn.Module): def __init__(self, model_config: ModelConfig[PretrainedConfig], - layer_idx: Optional[int]): + layer_idx: int): super().__init__() config = model_config.pretrained_config.vision_config self.norm1 = RMSNorm(hidden_size=config.hidden_size, @@ -606,14 +586,15 @@ class Qwen2_5_VLVisionBlock(torch.nn.Module): eps=model_config.pretrained_config.rms_norm_eps, dtype=model_config.pretrained_config.torch_dtype) self.attn = Qwen2_5_VLVisionAttention(model_config, layer_idx) - self.mlp = Qwen2_5_VLMLP(config, bias=True) + self.mlp = Qwen2_5_VLMLP(model_config, layer_idx) @torch.inference_mode() def forward( self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, rotary_pos_emb: Optional[torch.Tensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> torch.Tensor: @@ -621,6 +602,7 @@ class Qwen2_5_VLVisionBlock(torch.nn.Module): hidden_states = self.norm1(hidden_states) hidden_states = residual + self.attn( hidden_states=hidden_states, + attn_metadata=attn_metadata, rotary_pos_emb=rotary_pos_emb, position_embeddings=position_embeddings, **kwargs, @@ -650,21 +632,25 @@ class Qwen2_5_VLPatchMerger(torch.nn.Module): out_features=self.hidden_size, bias=True, dtype=model_config.pretrained_config.torch_dtype, - mapping=model_config.mapping), + 
mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + allreduce_strategy=model_config.allreduce_strategy), torch.nn.GELU(), Linear(in_features=self.hidden_size, out_features=dim, bias=True, dtype=model_config.pretrained_config.torch_dtype, - mapping=model_config.mapping), + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + allreduce_strategy=model_config.allreduce_strategy), ) @torch.inference_mode() - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.ln_q(x) - x = x.view(-1, self.hidden_size) - x = self.mlp(x) - return x + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.ln_q(hidden_states) + hidden_states = hidden_states.view(-1, self.hidden_size) + hidden_states = self.mlp(hidden_states) + return hidden_states class Qwen2_5_VisionModel(torch.nn.Module): @@ -740,7 +726,7 @@ class Qwen2_5_VisionModel(torch.nn.Module): return rotary_pos_emb def get_window_index(self, grid_thw): - window_index: list = [] + window_index: List[torch.Tensor] = [] seq_lens = [] window_index_id = 0 vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size @@ -783,13 +769,12 @@ class Qwen2_5_VisionModel(torch.nn.Module): return window_index, seq_lens def prepare_attn_metadata(self, seq_lens, attn_metadata: AttentionMetadata): - # NOTE: The single prompt is divided into multiple seq_lens, so pretending have many batch_sizes. 
- batch_size = len(seq_lens) + batch_size = 1 # NOTE: Qwen2/2.5-VL concats all the pixel_values into a single tensor, so batch_size is 1 prompt_lens = seq_lens seq_lens = torch.tensor(seq_lens, dtype=torch.int, pin_memory=True) request_ids = list(range(1, batch_size + 1)) - attn_metadata.num_contexts = batch_size + attn_metadata.num_contexts = len(seq_lens) attn_metadata.request_ids = request_ids attn_metadata.prompt_lens = prompt_lens attn_metadata.seq_lens = seq_lens @@ -798,7 +783,7 @@ class Qwen2_5_VisionModel(torch.nn.Module): return attn_metadata @torch.inference_mode() - def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, + def forward(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor: window_index, window_seq_lens = self.get_window_index(grid_thw) seq_lens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -814,7 +799,7 @@ class Qwen2_5_VisionModel(torch.nn.Module): window_seq_lens, self.window_attn_metadata) # From this point, pure GPU operation - hidden_states = self.patch_embed(hidden_states) + hidden_states = self.patch_embed(pixel_values) seq_len, _ = hidden_states.size() hidden_states = hidden_states.reshape( seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) @@ -834,7 +819,6 @@ class Qwen2_5_VisionModel(torch.nn.Module): attn_metadata = full_attn_metadata else: attn_metadata = window_attn_metadata - hidden_states = block( hidden_states, attn_metadata=attn_metadata, @@ -857,30 +841,27 @@ class Qwen2VLModelBase(PreTrainedModel): self.original_arch = model_config.pretrained_config.architectures[0] # NOTE: Setting disable_fuse_rope to True to do mrope fusion in the model engine by pre-computing rotary_cos_sin in the model engine - disabble_fuse_rope = kwargs.get('disable_fuse_rope', False) - model_config.pretrained_config.disable_fuse_rope = disabble_fuse_rope + disable_fuse_rope = kwargs.get('disable_fuse_rope', False) + model_config.pretrained_config.disable_fuse_rope = 
disable_fuse_rope model_config.pretrained_config.rope_scaling['type'] = 'mrope' config = model_config.pretrained_config self._supports_sdpa = True super().__init__(config) - if not disabble_fuse_rope: - self.init_mrope_embedding(model_config) - self.model_config = model_config self.config = model_config.pretrained_config if model_config.attn_backend != 'TRTLLM': raise ValueError("Qwen2/2.5-VL only supports TRTLLM backend now") - if not disabble_fuse_rope: + if not disable_fuse_rope: self.init_mrope_embedding(model_config) llm_model_config = copy.deepcopy(model_config) llm_model_config.pretrained_config.architectures = ["Qwen2ForCausalLM"] self.llm = AutoModelForCausalLM.from_config(llm_model_config) - if not DISAGG: + if not _is_disagg(): mm_encoder_config = copy.deepcopy(model_config) self.mm_encoder = Qwen2VisionModelBase( mm_encoder_config, kwargs.get('vision_model_class', None)) @@ -977,21 +958,28 @@ class Qwen2VLModelBase(PreTrainedModel): multimodal_params = kwargs.get("multimodal_params", []) mm_embeds = [] mrope_config = {} - if len(multimodal_params) > 0: - if not DISAGG: + # NOTE: Qwen*-VL series has mrope_config even on the text-only prompts, so we need to separate the mm_multimodal_params from the text-only prompts. + mm_multimodal_params = [ + multimodal_param for multimodal_param in multimodal_params + if multimodal_param.multimodal_data.get("image", {}).get( + "pixel_values") is not None or multimodal_param.multimodal_data. + get("video", {}).get("pixel_values_videos") is not None + ] + if len(mm_multimodal_params) > 0: + if not _is_disagg(): mm_embeds = get_multimodal_embeddings( encoder_forward_fn=self.mm_encoder.forward, - multimodal_params=multimodal_params[:num_context_requests]) + multimodal_params=mm_multimodal_params) else: raise NotImplementedError( "Qwen2VLModel does not support disaggregated inference yet. Please unset " f"the TLLM_MULTIMODAL_DISAGGREGATED environment variable, or set it to '0'." 
) - mm_embeds = find_input_mm_embeds( - mm_embeds, multimodal_params[:num_context_requests]) - if not self.model_config.pretrained_config.disable_fuse_rope: - mrope_config = self.prepare_mrope_config( - multimodal_params, num_context_requests) + mm_embeds = find_input_mm_embeds(mm_embeds, mm_multimodal_params) + + if not self.model_config.pretrained_config.disable_fuse_rope: + mrope_config = self.prepare_mrope_config(multimodal_params, + num_context_requests) input_ids, input_embeds = fuse_input_embeds(self.llm.model.embed_tokens, input_ids, mm_embeds, @@ -1038,9 +1026,8 @@ class Qwen2VLModel(Qwen2VLModelBase): ] def load_weights(self, weights, weight_mapper: BaseWeightMapper): - if not DISAGG: - vision_encoder_weights = process_weights(weights, "visual") - self.mm_encoder.load_state_dict(vision_encoder_weights, strict=True) + if not _is_disagg(): + self.mm_encoder.load_weights(weights) self.llm.load_weights(weights, weight_mapper) @@ -1063,8 +1050,9 @@ class Qwen2_5_VLModel(Qwen2VLModelBase): def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): kwargs['vision_model_class'] = Qwen2_5_VisionModel - kwargs[ - 'disable_fuse_rope'] = False # TODO: Make this ModelConfig's argument + kwargs['disable_fuse_rope'] = kwargs.get( + 'disable_fuse_rope', + False) # TODO: Make this ModelConfig's argument super().__init__(model_config, *args, **kwargs) @property @@ -1078,7 +1066,7 @@ class Qwen2_5_VLModel(Qwen2VLModelBase): if isinstance(weight_mapper, Qwen2VLHfWeightMapper): weights = weight_mapper.preprocess_weights(weights) - if not DISAGG: + if not _is_disagg(): self.mm_encoder.load_weights(weights) self.llm.load_weights(weights) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3.py b/tensorrt_llm/_torch/models/modeling_qwen3.py index 81f0ce3360..3775de51ec 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3.py @@ -48,7 +48,11 @@ class Qwen3Attention(QKNormRoPEAttention): pos_embd_params = 
PositionalEmbeddingParams( type=PositionEmbeddingType.from_string(pos_type), rope=RopeParams.from_config(config), - ) + mrope_section=config.rope_scaling.get("mrope_section", None), + mrope_interleaved=config.rope_scaling.get( + "mrope_interleaved", False)) + if config.rope_scaling.get("mrope_interleaved", False): + fuse_qk_norm_rope = False else: pos_embd_params = PositionalEmbeddingParams( type=PositionEmbeddingType.rope_gpt_neox, @@ -64,6 +68,7 @@ class Qwen3Attention(QKNormRoPEAttention): pos_embd_params=pos_embd_params, fuse_qk_norm_rope=fuse_qk_norm_rope, layer_idx=layer_idx, + rope_fusion=not getattr(config, 'disable_fuse_rope', False), dtype=config.torch_dtype, dense_bias=getattr(config, "attention_bias", None), config=model_config, diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 506b8a1473..e05ad149bd 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -18,7 +18,7 @@ from ..modules.fused_moe import (BaseMoeRoutingMethod, CutlassFusedMoE, RenormalizeNaiveMoeRoutingMethod, RoutingMethodType, TRTLLMGenFusedMoE, create_moe, get_moe_cls) -from ..modules.fused_moe.interface import MoE +from ..modules.fused_moe.interface import MoE, MoEWeightLoadingMode from ..modules.linear import TensorParallelMode from ..modules.rms_norm import RMSNorm from ..speculative import SpecMetadata @@ -114,6 +114,7 @@ class Qwen3MoE(nn.Module): moe_backend_cls=get_moe_cls(model_config), ) + self.weight_loading_mode = MoEWeightLoadingMode.FUSED_GATE_UP_PROJ if config.model_type == "qwen3_vl_moe_text" else MoEWeightLoadingMode.VANILLA self.experts = create_moe( num_experts=self.num_experts, routing_method=self.gate.routing_method, @@ -124,6 +125,7 @@ class Qwen3MoE(nn.Module): reduce_results=False, model_config=model_config, layer_idx=layer_idx, + weight_loading_mode=self.weight_loading_mode, ) def forward( @@ -221,6 +223,8 @@ class 
Qwen3MoEDecoderLayer(DecoderLayer): attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], spec_metadata: Optional[SpecMetadata] = None, + mrope_config: Optional[Dict[str, torch.Tensor]] = None, + deepstack_embeds: Optional[List[torch.Tensor]] = None, **kwargs, ) -> torch.Tensor: if residual is None: @@ -236,6 +240,7 @@ class Qwen3MoEDecoderLayer(DecoderLayer): attn_metadata=attn_metadata, all_reduce_params=AllReduceParams( enable_allreduce=not self.disable_attn_allreduce), + mrope_config=mrope_config, **kwargs, ) @@ -269,6 +274,10 @@ class Qwen3MoEDecoderLayer(DecoderLayer): do_finalize=do_finalize, ) + if deepstack_embeds is not None and self.layer_idx in range( + len(deepstack_embeds)): + residual = residual + deepstack_embeds[self.layer_idx] + if self.fusion_config.POST_MOE_FUSION: if do_finalize: hidden_states, residual = self.allreduce( @@ -365,6 +374,8 @@ class Qwen3MoEModel(DecoderModel): position_ids: Optional[torch.IntTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, spec_metadata: Optional[SpecMetadata] = None, + mrope_config: Optional[Dict[str, torch.Tensor]] = None, + deepstack_embeds: Optional[List[torch.Tensor]] = None, **kwargs, ) -> torch.Tensor: if (input_ids is None) ^ (inputs_embeds is not None): @@ -379,11 +390,14 @@ class Qwen3MoEModel(DecoderModel): residual = None for decoder_layer in self.layers: - hidden_states, residual = decoder_layer(position_ids=position_ids, - hidden_states=hidden_states, - attn_metadata=attn_metadata, - residual=residual, - spec_metadata=spec_metadata) + hidden_states, residual = decoder_layer( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + residual=residual, + spec_metadata=spec_metadata, + mrope_config=mrope_config, + deepstack_embeds=deepstack_embeds) return hidden_states diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_next.py b/tensorrt_llm/_torch/models/modeling_qwen3_next.py index 8061be539e..926ebc1ace 100644 --- 
a/tensorrt_llm/_torch/models/modeling_qwen3_next.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_next.py @@ -23,7 +23,6 @@ import torch.nn.functional as F import triton import triton.language as tl from torch import nn -from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import rope_config_validation @@ -320,9 +319,6 @@ class Qwen3NextConfig(PretrainedConfig): self.mlp_only_layers = mlp_only_layers -AutoConfig.register("qwen3_next", Qwen3NextConfig) - - class Qwen3NextGate(nn.Module): def __init__( diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py new file mode 100644 index 0000000000..3e423feb29 --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl.py @@ -0,0 +1,992 @@ +import copy +import re +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel +from transformers.activations import ACT2FN as HF_ACT2FN +from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLVisionPatchEmbed as HFQwen3VLVisionPatchEmbed, +) +from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLVisionRotaryEmbedding as HFQwen3VLVisionRotaryEmbedding, +) + +from tensorrt_llm._torch.models.modeling_multimodal_utils import _is_disagg +from tensorrt_llm.functional import PositionEmbeddingType + +from ..._utils import nvtx_range, nvtx_range_debug +from ...inputs import ( + BaseMultimodalDummyInputsBuilder, + BaseMultimodalInputProcessor, + ExtraProcessedInputs, + TextPrompt, +) +from ...inputs.multimodal import MultimodalParams +from ...logger import logger +from ...sampling_params import SamplingParams +from ..attention_backend import AttentionMetadata +from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams +from ..attention_backend.utils import 
get_attention_backend +from ..modules.layer_norm import LayerNorm +from ..modules.linear import Linear, TensorParallelMode +from ..modules.mlp import MLP +from ..modules.rotary_embedding import MRotaryEmbedding +from .modeling_auto import AutoModelForCausalLM +from .modeling_multimodal_utils import ( + find_input_mm_embeds, + fuse_input_embeds, + get_multimodal_embeddings, +) +from .modeling_qwen2vl import Qwen2_5_VLVisionAttention +from .modeling_utils import ModelConfig, QuantConfig, _load_weights_impl, filter_weights + + +class Qwen3VLInputProcessorBase(BaseMultimodalInputProcessor, BaseMultimodalDummyInputsBuilder): + def __init__( + self, + model_path: str, + config: PretrainedConfig, + tokenizer: AutoTokenizer, + trust_remote_code: bool = True, + **kwargs, + ): + super().__init__( + model_path=model_path, + config=config, + tokenizer=tokenizer, + trust_remote_code=trust_remote_code, + **kwargs, + ) + self._dtype = self.config.text_config.dtype + self._tokenizer = ( + tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(model_path) + ) + self._model_path = model_path + self._processor = AutoProcessor.from_pretrained( + model_path, use_fast=True, trust_remote_code=trust_remote_code + ) + self.tllm_multimodal_token_id = self.get_vocab_size() + 1 + # temporal patch size for video frames + self.temporal_patch_size = getattr(self.config.vision_config, "temporal_patch_size", 1) + + @property + def config(self) -> PretrainedConfig: + return self._config + + @property + def tokenizer(self) -> AutoTokenizer: + return self._tokenizer + + @property + def model_path(self) -> str: + return self._model_path + + @property + def processor(self) -> AutoProcessor: + return self._processor + + @property + def dtype(self) -> torch.dtype: + return self._dtype + + def get_vocab_size(self) -> int: + """Return the vocab size of the model.""" + return self.config.text_config.vocab_size + + @classmethod + def get_rope_index( + cls, + model_config: PretrainedConfig, + 
input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids.""" + + # Since we use timestamps to separate videos, like + # , the video_grid_thw should also be split + if video_grid_thw is not None: + video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0) + video_grid_thw[:, 0] = 1 + + spatial_merge_size = model_config.vision_config.spatial_merge_size + image_token_id = model_config.image_token_id + video_token_id = model_config.video_token_id + vision_start_token_id = model_config.vision_start_token_id + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 
1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode + # the temporal information for videos) + t_index = ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor( + 
mrope_position_deltas, device=input_ids.device + ).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def _preprocess( + self, text: Dict[str, Any], mm_data: Dict[str, Any], mm_processor_kwargs: Dict[str, Any] + ): + images = mm_data.get("image") + video_datas = mm_data.get("video") + if video_datas is not None: + videos = [video_data.frames for video_data in video_datas] + else: + videos = None + do_rescale = True + if images and isinstance(images[0], torch.Tensor): + do_rescale = False + if videos and isinstance(videos[0][0], torch.Tensor): + do_rescale = False + return self.processor( + text=[text], + images=images, + videos=videos, + padding=True, + do_rescale=do_rescale, + return_tensors="pt", + **mm_processor_kwargs, + ) + + def _postprocess(self, input_ids: torch.IntTensor) -> torch.IntTensor: + masks = (input_ids == self.config.image_token_id) | ( + input_ids == self.config.video_token_id + ) + input_ids[masks] = self.tllm_multimodal_token_id + return input_ids + + def get_mrope_config( + self, + input_ids: torch.IntTensor, + image_grid_thw: torch.LongTensor, + video_grid_thw: torch.LongTensor, + attention_mask: torch.Tensor, + ) -> dict[str, torch.Tensor]: + mrope_position_ids, mrope_position_deltas = Qwen3VLInputProcessorBase.get_rope_index( + 
self.config, input_ids, image_grid_thw, video_grid_thw, attention_mask + ) + + mrope_config = {} + mrope_config["mrope_position_ids"] = mrope_position_ids.to("cpu").clone() + mrope_config["mrope_position_deltas"] = ( + mrope_position_deltas.to("cpu").to(torch.int32).clone() + ) + + return mrope_config + + @nvtx_range("Qwen3VLInputProcessorBase forward()") + @torch.inference_mode() + def __call__( + self, + inputs: TextPrompt, + sampling_params: SamplingParams, + ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]: + text_prompt, mm_data, mm_processor_kwargs = ( + inputs.get("prompt"), + inputs.get("multi_modal_data", {}), + inputs.get("mm_processor_kwargs", {}), + ) + with nvtx_range_debug("transformers input preprocess"): + processed_inputs = self._preprocess(text_prompt, mm_data, mm_processor_kwargs) + + multimodal_data = {} + pixel_values = processed_inputs.get("pixel_values", None) + if pixel_values is not None: + multimodal_data["image"] = { + "pixel_values": pixel_values.to(self.dtype), + "image_grid_thw": processed_inputs.get("image_grid_thw"), + } + + pixel_values_videos = processed_inputs.get("pixel_values_videos", None) + if pixel_values_videos is not None: + multimodal_data["video"] = { + "pixel_values_videos": pixel_values_videos.to(self.dtype), + "video_grid_thw": processed_inputs.get("video_grid_thw"), + } + + # NOTE: Even on the text-only prompts, we still need 'mrope_position_ids'. 
+ mrope_config = self.get_mrope_config( + processed_inputs["input_ids"], + processed_inputs.get("image_grid_thw", None), + processed_inputs.get("video_grid_thw", None), + processed_inputs.get("attention_mask", None), + ) + multimodal_data["mrope_config"] = mrope_config + + fused_input_ids = processed_inputs["input_ids"][0] + if mm_data: + fused_input_ids = self._postprocess(fused_input_ids) + + return fused_input_ids.to(torch.int32).tolist(), { + "multimodal_data": multimodal_data, + } + + +class Qwen3VLVisionAttention(Qwen2_5_VLVisionAttention): + def __init__(self, model_config, layer_idx): + model_config.pretrained_config.max_position_embeddings = ( + model_config.pretrained_config.text_config.max_position_embeddings + ) + model_config.pretrained_config.vision_config.torch_dtype = ( + model_config.pretrained_config.text_config.dtype + ) + super().__init__(model_config, layer_idx) + + +class Qwen3VLVisionMLP(MLP): + def __init__(self, model_config: ModelConfig[PretrainedConfig], layer_idx: int): + config = model_config.pretrained_config.vision_config + super().__init__( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + bias=True, + activation=HF_ACT2FN[config.hidden_act], + dtype=model_config.pretrained_config.text_config.dtype, + config=model_config, + layer_idx=layer_idx, + ) + + +class Qwen3VLVisionBlock(torch.nn.Module): + def __init__(self, model_config: ModelConfig[PretrainedConfig], layer_idx: int): + super().__init__() + config = model_config.pretrained_config.vision_config + + self.norm1 = LayerNorm( + hidden_size=config.hidden_size, + eps=model_config.pretrained_config.text_config.rms_norm_eps, + dtype=model_config.pretrained_config.text_config.dtype, + ) + self.norm2 = LayerNorm( + hidden_size=config.hidden_size, + eps=model_config.pretrained_config.text_config.rms_norm_eps, + dtype=model_config.pretrained_config.text_config.dtype, + ) + self.attn = Qwen3VLVisionAttention(model_config, layer_idx) + self.mlp = 
Qwen3VLVisionMLP(model_config, layer_idx) + + @torch.inference_mode() + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = residual + self.attn( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + position_embeddings=position_embeddings, + **kwargs, + ) + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = residual + self.mlp(hidden_states) + return hidden_states + + +class Qwen3VLVisionPatchMerger(torch.nn.Module): + def __init__( + self, model_config: ModelConfig[PretrainedConfig], use_postshuffle_norm: bool = False + ) -> None: + super().__init__() + config = model_config.pretrained_config.vision_config + self.hidden_size = config.hidden_size * (config.spatial_merge_size**2) + self.use_postshuffle_norm = use_postshuffle_norm + self.norm = LayerNorm( + hidden_size=self.hidden_size if use_postshuffle_norm else config.hidden_size, + eps=model_config.pretrained_config.text_config.rms_norm_eps, + dtype=model_config.pretrained_config.text_config.dtype, + ) + self.linear_fc1 = Linear( + in_features=self.hidden_size, + out_features=self.hidden_size, + bias=True, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + allreduce_strategy=model_config.allreduce_strategy, + ) + self.act_fn = nn.GELU() + self.linear_fc2 = Linear( + in_features=self.hidden_size, + out_features=config.out_hidden_size, + bias=True, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + allreduce_strategy=model_config.allreduce_strategy, + ) + + @torch.inference_mode() + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if self.use_postshuffle_norm: + hidden_states = hidden_states.view(-1, self.hidden_size) + + hidden_states = 
self.norm(hidden_states).view(-1, self.hidden_size) + hidden_states = self.linear_fc1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_fc2(hidden_states) + return hidden_states + + +class Qwen3VisionModel(torch.nn.Module): + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__() + self.model_config = model_config + self.config = self.model_config.pretrained_config.vision_config + + self.spatial_merge_size = self.config.spatial_merge_size + self.patch_size = self.config.patch_size + self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + + self.patch_embed = HFQwen3VLVisionPatchEmbed( + config=self.config, + ) + + self.pos_embed = nn.Embedding(self.config.num_position_embeddings, self.config.hidden_size) + self.num_grid_per_side = int(self.config.num_position_embeddings**0.5) + + head_dim = self.config.hidden_size // self.config.num_heads + self.rotary_pos_emb = HFQwen3VLVisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList( + [ + Qwen3VLVisionBlock(model_config, layer_idx=layer_idx) + for layer_idx in range(self.config.depth) + ] + ) + self.merger = Qwen3VLVisionPatchMerger( + model_config=model_config, + use_postshuffle_norm=False, + ) + self.deepstack_visual_indexes = self.config.deepstack_visual_indexes + self.deepstack_merger_list = nn.ModuleList( + [ + Qwen3VLVisionPatchMerger( + model_config=model_config, + use_postshuffle_norm=True, + ) + for _ in range(len(self.deepstack_visual_indexes)) + ] + ) + self.metadata_cls = get_attention_backend(self.model_config.attn_backend).Metadata + + self.attn_metadata = self.metadata_cls( + max_num_requests=8192, # TODO: Make this dynamic + max_num_tokens=8192, # TODO: Make this dynamic + kv_cache_manager=None, + ) + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + merge_size = self.spatial_merge_size + + max_hw = int(grid_thw[:, 1:].max().item()) + freq_table = self.rotary_pos_emb(max_hw) # 
(max_hw, dim // 2) + device = freq_table.device + + total_tokens = int(torch.prod(grid_thw, dim=1).sum().item()) + pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device) + + offset = 0 + for num_frames, height, width in grid_thw: + merged_h, merged_w = height // merge_size, width // merge_size + + block_rows = torch.arange(merged_h, device=device) # block row indices + block_cols = torch.arange(merged_w, device=device) # block col indices + intra_row = torch.arange(merge_size, device=device) # intra-block row offsets + intra_col = torch.arange(merge_size, device=device) # intra-block col offsets + + # Compute full-resolution positions + row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None] + col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :] + + row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + + coords = torch.stack((row_idx, col_idx), dim=-1) + + if num_frames > 1: + coords = coords.repeat(num_frames, 1) + + num_tokens = coords.shape[0] + pos_ids[offset : offset + num_tokens] = coords + offset += num_tokens + + embeddings = freq_table[pos_ids] # lookup rotary embeddings + embeddings = embeddings.flatten(1) + return embeddings + + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + 
base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device) + weight_tensor = torch.tensor( + weight_list, dtype=self.pos_embed.weight.dtype, device=self.pos_embed.weight.device + ) + pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def prepare_attn_metadata(self, seq_lens, attn_metadata: AttentionMetadata): + # NOTE: The single prompt is divided into multiple seq_lens, so pretending have many batch_sizes. 
+ batch_size = len(seq_lens) + prompt_lens = seq_lens + seq_lens = torch.tensor(seq_lens, dtype=torch.int, pin_memory=True) + request_ids = list(range(1, batch_size + 1)) + + attn_metadata.num_contexts = batch_size + attn_metadata.request_ids = request_ids + attn_metadata.prompt_lens = prompt_lens + attn_metadata.seq_lens = seq_lens + attn_metadata.max_seq_len = seq_lens.max().item() + attn_metadata.prepare() + return attn_metadata + + @torch.inference_mode() + def forward( + self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs + ) -> torch.Tensor: + seq_lens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).tolist() + attn_metadata = self.prepare_attn_metadata(seq_lens, self.attn_metadata) + + # Getting positional embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # From this point, pure GPU operation + hidden_states = self.patch_embed(hidden_states) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + deepstack_feature_lists = [] + for layer_num, block in enumerate(self.blocks): + hidden_states = block( + hidden_states, + attn_metadata=attn_metadata, + position_embeddings=position_embeddings, + ) + if layer_num in self.deepstack_visual_indexes: + deepstack_feature = self.deepstack_merger_list[ + self.deepstack_visual_indexes.index(layer_num) + ](hidden_states) + deepstack_feature_lists.append(deepstack_feature) + hidden_states = self.merger(hidden_states) + + return hidden_states, deepstack_feature_lists + + +class Qwen3VisionModelBase(nn.Module): + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + model_class: Union[type[PreTrainedModel], type[torch.nn.Module]], + ): + super().__init__() + self.model_config = model_config + self.model_dtype = 
self.model_config.pretrained_config.text_config.dtype + + # NOTE: Re-setting QuantConfig to exclude vision encoder weights from quantization load. + self.model_config.quant_config = QuantConfig( + kv_cache_quant_algo=self.model_config.quant_config.kv_cache_quant_algo + ) + + self.visual = model_class(self.model_config).to(self.model_dtype) + + self.post_config() + + def post_config(self): + self.config = self.model_config.pretrained_config.vision_config + + def load_weights(self, weights: Dict[str, torch.Tensor]): + visual_weights = filter_weights("model.visual", weights) + converted_weights = {} + + qkv_pattern = re.compile(r"(.*?)attn\.qkv\.(.*)") + for name in visual_weights: + # Handle with weights and bias for vision transformer's qkv projection. + match = qkv_pattern.match(name) + if match: + prefix, suffix = match.groups() + q_name = f"{prefix}attn.q_proj.{suffix}" + k_name = f"{prefix}attn.k_proj.{suffix}" + v_name = f"{prefix}attn.v_proj.{suffix}" + dim_shape = visual_weights[name].shape[0] // 3 + converted_weights[q_name] = visual_weights[name][:dim_shape] + converted_weights[k_name] = visual_weights[name][dim_shape : 2 * dim_shape] + converted_weights[v_name] = visual_weights[name][2 * dim_shape :] + else: + converted_weights[name] = visual_weights[name] + pattern_mapping = { + r"(.*?)attn.proj.(.*)": r"\1attn.o_proj.\2", + r"(.*?)mlp.linear_fc1.(.*)": r"\1mlp.up_proj.\2", + r"(.*?)mlp.linear_fc2.(.*)": r"\1mlp.down_proj.\2", + } + self.visual.config.num_attention_heads = self.visual.config.num_heads + _load_weights_impl(self.visual, converted_weights, params_map=pattern_mapping) + + def _parse_and_batch_multimodal_data( + self, multimodal_params: List[MultimodalParams] + ) -> Tuple[Dict[str, Any], Dict[str, List[Any]]]: + pixel_values_list = [] + pixel_values_videos_list = [] + image_grid_thw_list = [] + video_grid_thw_list = [] + + for multimodal_param in multimodal_params: + multimodal_data = multimodal_param.multimodal_data + # Process images if 
present + if multimodal_data.get("image") is not None: + pixel_values_list.append(multimodal_data["image"]["pixel_values"]) + image_grid_thw_list.append(multimodal_data["image"]["image_grid_thw"]) + + # Process videos if present + if multimodal_data.get("video") is not None: + pixel_values_videos_list.append(multimodal_data["video"]["pixel_values_videos"]) + video_grid_thw_list.append(multimodal_data["video"]["video_grid_thw"]) + + # Concatenate tensors + mm_content_dict = {} + if pixel_values_list: + mm_content_dict["pixel_values"] = ( + torch.cat(pixel_values_list, dim=0) + if len(pixel_values_list) > 1 + else pixel_values_list[0] + ) + if pixel_values_videos_list: + mm_content_dict["pixel_values_videos"] = ( + torch.cat(pixel_values_videos_list, dim=0) + if len(pixel_values_videos_list) > 1 + else pixel_values_videos_list[0] + ) + + # Prepare extra data + mm_extra_data = {} + if image_grid_thw_list: + mm_extra_data["image_grid_thw"] = ( + torch.cat(image_grid_thw_list, dim=0) + if len(image_grid_thw_list) > 1 + else image_grid_thw_list[0] + ) + if video_grid_thw_list: + mm_extra_data["video_grid_thw"] = ( + torch.cat(video_grid_thw_list, dim=0) + if len(video_grid_thw_list) > 1 + else video_grid_thw_list[0] + ) + + return mm_content_dict, mm_extra_data + + @torch.inference_mode() + def forward(self, multimodal_params: List[MultimodalParams]) -> List[torch.Tensor]: + mm_content_data, mm_extra_data = self._parse_and_batch_multimodal_data(multimodal_params) + pixel_values = mm_content_data.get("pixel_values", None) + pixel_values_videos = mm_content_data.get("pixel_values_videos", None) + + if pixel_values is not None and pixel_values_videos is not None: + raise ValueError("Currently only support single modality per request") + + image_grid_thw = mm_extra_data.get("image_grid_thw", None) + video_grid_thw = mm_extra_data.get("video_grid_thw", None) + + embeds = [] + if pixel_values is not None: + pixel_values = pixel_values.to(self.model_dtype) + image_embeds, 
deepstack_image_embeds = self.visual( + pixel_values, grid_thw=image_grid_thw + ) + # NOTE: We concatenate deepstack_embeds to mm_embeds + # The shape will be [seq_len, hidden_dim * (num_deepstack_layers + 1)] + mixed_image_embeds = torch.cat([image_embeds] + deepstack_image_embeds, dim=1) + embeds.append(mixed_image_embeds) + + if pixel_values_videos is not None: + pixel_values_videos = pixel_values_videos.to(self.model_dtype) + video_embeds, deepstack_video_embeds = self.visual( + pixel_values_videos, grid_thw=video_grid_thw + ) + # NOTE: We concatenate deepstack_embeds to mm_embeds + # The shape will be [seq_len, hidden_dim * (num_deepstack_layers + 1)] + mixed_video_embeds = torch.cat([video_embeds] + deepstack_video_embeds, dim=1) + embeds.append(mixed_video_embeds) + return embeds + + +class Qwen3VLModelBase(PreTrainedModel): + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + *args, + **kwargs, + ) -> None: + self.original_arch = model_config.pretrained_config.architectures[0] + + disable_fuse_rope = kwargs.get("disable_fuse_rope", False) + model_config.pretrained_config.text_config.disable_fuse_rope = disable_fuse_rope + model_config.pretrained_config.text_config.rope_scaling["type"] = "mrope" + config = model_config.pretrained_config + + self._supports_sdpa = True + self._supports_flash_attn = True + super().__init__(config) + if not disable_fuse_rope: + self.init_mrope_embedding(model_config) + + self.model_config = model_config + + llm_model_config = copy.deepcopy(model_config) + llm_model_config.pretrained_config = config.text_config + llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"] + self.llm = AutoModelForCausalLM.from_config(llm_model_config) + + if not _is_disagg(): + self.mm_encoder = Qwen3VisionModelBase( + model_config, kwargs.get("vision_model_class", None) + ).eval() + + self.use_deepstack = hasattr(config.vision_config, "deepstack_visual_indexes") + self.deepstack_num_level = ( + 
len(config.vision_config.deepstack_visual_indexes) if self.use_deepstack else 0 + ) + + self.post_config() + + def post_config(self): + # use llm.config as config for pytorch model engine + self.model_config.pretrained_config = self.llm.config + self.config = self.model_config.pretrained_config + + def infer_max_seq_len(self) -> int: + return self.llm.infer_max_seq_len() + + def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]): + config = model_config.pretrained_config.text_config + pos_embd_params = PositionalEmbeddingParams( + type=PositionEmbeddingType.from_string(config.rope_scaling["type"]), + rope=RopeParams.from_config(config), + mrope_section=config.rope_scaling.get("mrope_section", None), + mrope_interleaved=config.rope_scaling.get("mrope_interleaved", False), + ) + self.rotary_emb = MRotaryEmbedding( + pos_embd_params.rope, + head_dim=config.hidden_size // config.num_attention_heads, + is_neox=pos_embd_params.is_neox, + mrope_section=pos_embd_params.mrope_section, + mrope_interleaved=pos_embd_params.mrope_interleaved, + ).to("cuda") + self.mrope_position_ids_padding_cuda = torch.zeros( + ( + 3, + 1, + config.max_position_embeddings, + ), + dtype=torch.int32, + device="cuda", + ) + + @nvtx_range("Qwen3-VL prepare_mrope_config") + def prepare_mrope_config( + self, multimodal_params: List[MultimodalParams], num_context_requests: int + ): + mrope_config = {} + mrope_rotary_cos_sin = [] + mrope_position_deltas = [] + for multimodal_param in multimodal_params[:num_context_requests]: + if multimodal_param.multimodal_data.get("mrope_config") is not None: + with nvtx_range("Qwen3-VL get_cos_sin"): + if ( + multimodal_param.multimodal_data["mrope_config"].get("mrope_position_ids") + is not None + ): + mrope_position_ids = multimodal_param.multimodal_data["mrope_config"][ + "mrope_position_ids" + ] + + self.mrope_position_ids_padding_cuda[ + :, :, : mrope_position_ids.shape[-1] + ] = mrope_position_ids + self.mrope_position_ids_padding_cuda[ + 
:, :, mrope_position_ids.shape[-1] : + ] = 0 + cos, sin = self.rotary_emb.get_cos_sin(self.mrope_position_ids_padding_cuda) + concat_cos_sin = torch.stack((cos, sin), dim=-1) + concat_cos_sin = concat_cos_sin.reshape(concat_cos_sin.shape[0], -1) + mrope_rotary_cos_sin.append(concat_cos_sin) + + for multimodal_param in multimodal_params[num_context_requests:]: + if multimodal_param.multimodal_data.get("mrope_config") is not None: + if ( + multimodal_param.multimodal_data["mrope_config"].get("mrope_position_deltas") + is not None + ): + mrope_position_deltas.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"] + ) + + with nvtx_range("Qwen3-VL concat mrope_rotary_cos_sin"): + if mrope_rotary_cos_sin: + mrope_config["mrope_rotary_cos_sin"] = torch.cat(mrope_rotary_cos_sin, dim=0) + with nvtx_range("Qwen3-VL concat mrope_position_deltas"): + if mrope_position_deltas: + mrope_config["mrope_position_deltas"] = torch.cat(mrope_position_deltas, dim=0) + + return mrope_config + + def split_mm_embeds(self, mm_embed, deepstack_num_level): + num_elements = mm_embed.shape[1] // (deepstack_num_level + 1) + mm_embed_chunks = torch.split(mm_embed, [num_elements] * (deepstack_num_level + 1), dim=1) + return mm_embed_chunks[0], list(mm_embed_chunks[1:]) + + @torch.inference_mode() + def forward( + self, + attn_metadata: AttentionMetadata, + input_ids: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.IntTensor] = None, + input_embeds: Optional[torch.Tensor] = None, + return_context_logits: bool = False, + **kwargs, + ) -> torch.Tensor: + """ + VLM forward logic with inflight batching support. 
+ """ + num_context_requests, num_generation_requests = ( + attn_metadata.num_contexts, + attn_metadata.num_generations, + ) + logger.debug( + f"num_context_requests: {num_context_requests}, num_generation_requests: {num_generation_requests}" + ) + + multimodal_params = kwargs.get("multimodal_params", []) + mm_embeds = [] + mrope_config = {} + deepstack_embeds = [] + + # NOTE: Qwen*-VL series has mrope_config even on the text-only prompts, + # so we need to separate the mm_multimodal_params from the text-only prompts. + mm_multimodal_params = [ + multimodal_param + for multimodal_param in multimodal_params + if multimodal_param.multimodal_data.get("image", {}).get("pixel_values") is not None + or multimodal_param.multimodal_data.get("video", {}).get("pixel_values_videos") + is not None + ] + if len(mm_multimodal_params) > 0: + if not _is_disagg(): + mm_embeds = get_multimodal_embeddings( + encoder_forward_fn=self.mm_encoder.forward, + multimodal_params=mm_multimodal_params, + ) + else: + raise NotImplementedError( + "Qwen3VLModel does not support disaggregated inference yet. Please unset " + "the TLLM_MULTIMODAL_DISAGGREGATED environment variable, or set it to '0'." 
+ ) + mm_embeds = find_input_mm_embeds(mm_embeds, mm_multimodal_params) + + if self.use_deepstack: + for i, mm_embed in enumerate(mm_embeds): + mm_embed, deepstack_embed = self.split_mm_embeds( + mm_embed, self.deepstack_num_level + ) + mm_embeds[i] = mm_embed + deepstack_embeds.extend(deepstack_embed) + + if not self.model_config.pretrained_config.disable_fuse_rope: + mrope_config = self.prepare_mrope_config(multimodal_params, num_context_requests) + + result = fuse_input_embeds( + self.llm.model.embed_tokens, + input_ids, + mm_embeds, + extra_embeds=deepstack_embeds, + **kwargs, + ) + if len(deepstack_embeds) > 0: + input_ids, input_embeds, deepstack_embeds = result + else: + input_ids, input_embeds = result + + output_prob = self.llm.forward( + attn_metadata=attn_metadata, + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=input_embeds, + return_context_logits=return_context_logits, + deepstack_embeds=deepstack_embeds, + mrope_config=mrope_config, + ) + logger.debug(f"output shape: {output_prob.shape}") + return output_prob diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3vl_moe.py new file mode 100644 index 0000000000..a7a0050383 --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_qwen3vl_moe.py @@ -0,0 +1,64 @@ +from typing import Dict, List + +import torch +from transformers import PretrainedConfig + +from tensorrt_llm._torch.models.modeling_multimodal_utils import _is_disagg + +from ...inputs import ( + MultimodalPlaceholderMetadata, + MultimodalPlaceholderPlacement, + register_input_processor, +) +from .checkpoints.base_weight_mapper import BaseWeightMapper +from .checkpoints.hf.qwen3vl_moe_weight_mapper import Qwen3VLMoeHfWeightMapper +from .modeling_qwen3vl import ( + Qwen3VisionModel, + Qwen3VisionModelBase, + Qwen3VLInputProcessorBase, + Qwen3VLModelBase, +) +from .modeling_utils import ModelConfig, register_auto_model, register_vision_encoder + + 
+@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel) +@register_auto_model("Qwen3VLMoeForConditionalGeneration") +@register_input_processor( + Qwen3VLInputProcessorBase, + model_type="qwen3_vl_moe", + placeholder_metadata=MultimodalPlaceholderMetadata( + placeholder_map={ + "image": "<|vision_start|><|image_pad|><|vision_end|>", + "video": "<|vision_start|><|video_pad|><|vision_end|>", + }, + placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT, + ), +) +class Qwen3MoeVLModel(Qwen3VLModelBase): + def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs): + # NOTE: HF implementation. + kwargs["vision_model_class"] = Qwen3VisionModel + kwargs["disable_fuse_rope"] = kwargs.get( + "disable_fuse_rope", False + ) # TODO: Make this ModelConfig's argument + super().__init__(model_config, *args, **kwargs) + + @property + def multimodal_data_device_paths(self) -> List[str]: + return [ + "image.pixel_values", + "video.pixel_values_videos", + "multimodal_embedding", + ] + + def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper): + if not _is_disagg(): + self.mm_encoder.load_weights(weights) + + weight_mapper = Qwen3VLMoeHfWeightMapper() + weight_mapper.init_model_and_config(self.llm, self.model_config) + filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")} + params_map = { + r"^model\.language_model\.(.*)$": r"model.\1", + } + self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map) diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index 17d3aba15f..8adb412d01 100755 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -672,10 +672,12 @@ class SpecDecOneEngineForCausalLM(DecoderModelForCausalLM[TModel, TConfig], def load_weights(self, weights: Dict, weight_mapper: Optional[BaseWeightMapper] = 
None, + params_map: Optional[Dict[str, str]] = None, allow_partial_loading: bool = False): super().load_weights(weights=weights, weight_mapper=weight_mapper, skip_modules=["draft_model"], + params_map=params_map, allow_partial_loading=allow_partial_loading) def load_draft_weights(self, diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py index c17eacbefa..0e7503f792 100755 --- a/tensorrt_llm/_torch/models/modeling_utils.py +++ b/tensorrt_llm/_torch/models/modeling_utils.py @@ -561,6 +561,7 @@ class DecoderModelForCausalLM(nn.Module, weights: Dict, weight_mapper: Optional["BaseWeightMapper"] = None, skip_modules: List[str] = [], + params_map: Optional[Dict[str, str]] = None, allow_partial_loading: bool = False): # TODO smor- this solution is a temporary solution to load weights while we are still using # the old checkpoint format loading process. Once checkpoint format is unified @@ -570,6 +571,7 @@ class DecoderModelForCausalLM(nn.Module, _load_weights_impl(self, weights, skip_modules, + params_map=params_map, preload_weight_modules=preload_weight_modules, allow_partial_loading=allow_partial_loading) else: @@ -577,6 +579,7 @@ class DecoderModelForCausalLM(nn.Module, weights, weight_mapper, skip_modules, + params_map=params_map, preload_weight_modules=preload_weight_modules, allow_partial_loading=allow_partial_loading) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index 383ebf8296..aec1489676 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -324,7 +324,7 @@ class Attention(nn.Module): head_dim=self.head_dim, is_neox=self.pos_embd_params.is_neox, mrope_section=self.pos_embd_params.mrope_section, - ) + mrope_interleaved=self.pos_embd_params.mrope_interleaved) else: self.rotary_emb = RotaryEmbedding( self.pos_embd_params.rope, diff --git a/tensorrt_llm/_torch/modules/qk_norm_attention.py 
b/tensorrt_llm/_torch/modules/qk_norm_attention.py index 771e7f79a5..f71ef4eff3 100644 --- a/tensorrt_llm/_torch/modules/qk_norm_attention.py +++ b/tensorrt_llm/_torch/modules/qk_norm_attention.py @@ -160,6 +160,7 @@ class QKNormRoPEAttention(Attention): attn_output_gate: Optional[bool] = None, is_qk_norm: bool = True, reduce_output: bool = True, + rope_fusion: bool = True, ): self.pretrained_config = config.pretrained_config @@ -170,7 +171,8 @@ class QKNormRoPEAttention(Attention): # If fuse_qk_norm_rope is true, do not apply fused RoPE in attention OP, and self.rotary_emb # will be skipped in the overridden apply_rope. - rope_fusion = not self.fuse_qk_norm_rope and not skip_rope and not attn_output_gate and not use_gemma_rms_norm + rope_fusion &= (not self.fuse_qk_norm_rope and not skip_rope + and not attn_output_gate and not use_gemma_rms_norm) self.is_qk_norm = is_qk_norm assert not (fuse_qk_norm_rope and skip_rope ), "Fusing qk norm and skipping rope is not supported" diff --git a/tensorrt_llm/_torch/modules/rotary_embedding.py b/tensorrt_llm/_torch/modules/rotary_embedding.py index bde1ff859a..2b004673eb 100644 --- a/tensorrt_llm/_torch/modules/rotary_embedding.py +++ b/tensorrt_llm/_torch/modules/rotary_embedding.py @@ -136,9 +136,22 @@ class MRotaryEmbedding(RotaryEmbedding): head_dim: int, mrope_section: List[int], is_neox: bool = True, + mrope_interleaved: bool = False, ): super().__init__(rope_params, head_dim=head_dim, is_neox=is_neox) self.mrope_section = mrope_section + self.mrope_interleaved = mrope_interleaved + + def apply_interleaved_rope(self, x: torch.Tensor) -> torch.Tensor: + # referenced from https://github.com/vllm-project/vllm/blob/aeb82b1930454498fccc7e91f7c4e0f360cf658a/vllm/model_executor/layers/rotary_embedding/mrope.py#L191 + x_t = x[0].clone() + x_t[..., + 1:self.mrope_section[1] * 3:3] = x[1, ..., + 1:self.mrope_section[1] * 3:3] + x_t[..., + 2:self.mrope_section[2] * 3:3] = x[2, ..., + 2:self.mrope_section[2] * 3:3] + return x_t def 
get_cos_sin( self, @@ -146,16 +159,20 @@ class MRotaryEmbedding(RotaryEmbedding): if position_ids.ndim == 3: cos_sin = self.rotary_cos_sin[position_ids.view(3, -1)] cos, sin = cos_sin[:, :, 0, :], cos_sin[:, :, 1, :] - cos = torch.cat([ - m[i] - for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) - ], - dim=-1) - sin = torch.cat([ - m[i] - for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) - ], - dim=-1) + if self.mrope_interleaved: + cos = self.apply_interleaved_rope(cos) + sin = self.apply_interleaved_rope(sin) + else: + cos = torch.cat([ + m[i] + for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) + ], + dim=-1) + sin = torch.cat([ + m[i] + for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) + ], + dim=-1) else: # Fallback to the original RoPE where position_ids is 2D for dummy requests cos_sin = self.rotary_cos_sin[position_ids.view(-1)] diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py index 14bd727d9c..187566f62e 100644 --- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py +++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py @@ -319,7 +319,7 @@ class CUDAGraphRunner: } if self.config.use_mrope: sliced_static_tensors["position_ids"] = self.shared_static_tensors[ - "position_ids"][:, :, :num_tokens_for_capture], + "position_ids"][:, :, :num_tokens_for_capture] sliced_static_tensors[ "multimodal_params"] = self.shared_static_tensors[ "multimodal_params"][:batch_size * self.max_beam_width] diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index 3906fbe274..bf948eb250 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -590,7 +590,7 @@ def build_llava_engine(args): model = LlavaOnevisionForConditionalGeneration.from_pretrained( args.model_path, dtype=torch.float16) wrapper = LlavaOnevisionVisionWrapper( - model.vision_tower.vision_model.to(args.device), 
+ model.vision_tower.to(args.device), model.multi_modal_projector.to(args.device), model.config) export_onnx(wrapper, image, f'{args.output_dir}/onnx') diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index 6a8cc12d00..0965e87352 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -19,3 +19,5 @@ nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16: - accuracy: 26.67 microsoft/Phi-4-multimodal-instruct: - accuracy: 53.67 +Qwen/Qwen3-VL-30B-A3B-Instruct: + - accuracy: 55.33 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index 194715ed29..7aeefd433c 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -245,3 +245,21 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness): ) as llm: task = MMMU(self.MODEL_NAME) task.evaluate(llm, sampling_params=self.sampling_params) + + +class TestQwen3VL_MOE(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct" + MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-VL-30B-A3B-Instruct" + MAX_NUM_TOKENS = 16384 + + sampling_params = SamplingParams( + max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<|endoftext|>" + ) + + def test_auto_dtype(self): + with LLM( + self.MODEL_PATH, + max_num_tokens=self.MAX_NUM_TOKENS, + ) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 776ef654ef..79ac009326 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -656,6 +656,7 @@ 
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index c9dfda7070..e73d633299 100644 --- a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -21,6 +21,7 @@ l0_l40s: - unittest/_torch/modeling -k "modeling_phi4mm" - unittest/_torch/modeling/test_modeling_llava_next.py::TestLlavaNext::test_all - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all + - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py index af28829e73..a1879ed30a 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py @@ -1,3 +1,4 @@ +import pytest import torch from _model_test_utils import get_small_model_config from build_and_run_ad import 
ExperimentConfig @@ -9,6 +10,10 @@ from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device def test_build_run_llama4_vlm(): + pytest.skip( + "Skipping test_build_run_llm4_vlm because Llama4 is giving an error on upgrading transformers version to 4.57.1" + "https://nvbugspro.nvidia.com/bug/5732942" + ) atol = 1e-3 rtol = 1e-3 diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py index 79457fbfca..4b2c75f29d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py @@ -201,6 +201,19 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs): ], ) def test_build_ad(model_hub_id: str, llm_extra_args: dict): + if ( + model_hub_id == "mistralai/Mixtral-8x7B-Instruct-v0.1" + and llm_extra_args.get("mode") != "transformers" + ): + pytest.skip( + "Mixtral-8x7B-Instruct-v0.1 is giving an error on upgrading transformers version to 4.57.1" + "https://nvbugspro.nvidia.com/bug/5732942" + ) + if model_hub_id == "Qwen/Qwen3-30B-A3B" and llm_extra_args.get("mode") != "transformers": + pytest.skip( + "Qwen3-30B-A3B is giving an error on upgrading transformers version to 4.57.1" + "https://nvbugspro.nvidia.com/bug/5732942" + ) experiment_config = get_small_model_config(model_hub_id, **llm_extra_args) experiment_config["args"]["runtime"] = "demollm" # Default runtime set to demollm experiment_config["args"]["world_size"] = 0 # Default world_size set to 0 diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py index 941b15890e..599b1be021 100644 --- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py +++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py @@ -271,7 +271,10 @@ class 
TestLlama4MinLatency(unittest.TestCase): "The transformers between 4.55.0 and 4.56.1 have accuracy " "issues for Llama4. See: " "https://github.com/huggingface/transformers/pull/40609") - + elif transformers.__version__ >= "4.57.1": + self.skipTest( + "Bumping transformers version to 4.57.1 has accuracy issues for Llama4. See: " + "http://nvbugs/5732958") torch.random.manual_seed(0) config_dict = deepcopy(LLAMA_4_MAVERICK_TWO_LAYER_CONFIG) # 17B * sizeof(float16) plus some extra for activations diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py index 18c0c2634b..b65dfe8537 100644 --- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py +++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py @@ -185,6 +185,12 @@ class TestModelingMultimodal(unittest.TestCase, ABC): else: model.load_weights(hf_model_state_dict) + for module in model.modules(): + if hasattr(module, "post_load_weights") and not getattr( + module, "_weights_removed", False + ): + module.post_load_weights() + return model, model_config def create_hf_model(self, pretrained_config: PretrainedConfig) -> PreTrainedModel: @@ -457,7 +463,7 @@ class TestModelingMultimodal(unittest.TestCase, ABC): "attn_metadata" ].create_cuda_graph_metadata(1) - # Prepare metadata before capture (like in working Qwen2.5-VL test) + # Prepare metadata before capture trtllm_inputs["attn_metadata"].prepare() key = (1, 0, False) diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py index 56f71d2bad..f36b0e3542 100644 --- a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py +++ b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py @@ -187,7 +187,7 @@ class TestQwen2_5_VL(TestModelingMultimodal): return self.trtllm_model.forward(**trtllm_inputs) else: # NOTE: Qwen2.5-VL model uses mrope - graph_runner = create_mock_cuda_graph_runner(1, True) + 
graph_runner = create_mock_cuda_graph_runner(1, use_mrope=True) trtllm_inputs["attn_metadata"] = trtllm_inputs[ "attn_metadata"].create_cuda_graph_metadata(1) @@ -232,13 +232,6 @@ class TestQwen2_5_VL(TestModelingMultimodal): chunked_prefill=False, kv_cache_reuse=False), - # ==== Disable fuse rope scenarios ==== - TestQwen2_5_VLScenario(modality="image", - use_cuda_graph=False, - disable_fuse_rope=True, - chunked_prefill=False, - kv_cache_reuse=False), - # ==== Chunked Prefill Scenarios ==== TestQwen2_5_VLScenario(modality="image", use_cuda_graph=False, @@ -252,6 +245,13 @@ class TestQwen2_5_VL(TestModelingMultimodal): disable_fuse_rope=False, chunked_prefill=False, kv_cache_reuse=True), + + # ==== Disable fuse rope scenarios ==== + TestQwen2_5_VLScenario(modality="image", + use_cuda_graph=False, + disable_fuse_rope=True, + chunked_prefill=False, + kv_cache_reuse=False), ] return scenarios diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3vl_moe.py new file mode 100644 index 0000000000..c6d4080e7b --- /dev/null +++ b/tests/unittest/_torch/modeling/test_modeling_qwen3vl_moe.py @@ -0,0 +1,283 @@ +import os +from dataclasses import dataclass +from typing import List + +import torch +from _torch.helpers import create_mock_cuda_graph_runner +from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal +from transformers import Qwen3VLMoeConfig +from transformers import Qwen3VLMoeForConditionalGeneration as HFQwen3VLMoeForConditionalGeneration +from utils.llm_data import llm_models_root + +from tensorrt_llm._torch.models.checkpoints.hf.qwen3vl_moe_weight_mapper import ( + Qwen3VLMoeHfWeightMapper, +) +from tensorrt_llm._torch.models.modeling_qwen3vl_moe import Qwen3MoeVLModel + +QWEN3_VL_30B_A3B_CONFIG = { + "architectures": ["Qwen3VLMoeForConditionalGeneration"], + "image_token_id": 151655, + "model_type": "qwen3_vl_moe", + "text_config": { + "attention_bias": False, + 
"attention_dropout": 0.0, + "bos_token_id": 151643, + "decoder_sparse_step": 1, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 262144, + "mlp_only_layers": [], + "model_type": "qwen3_vl_moe_text", + "moe_intermediate_size": 768, + "norm_topk_prob": True, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 2, # NOTE: Only 2 layer for testing, 48 layers for full model + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_interleaved": True, + "mrope_section": [24, 20, 20], + "rope_type": "default", + }, + "rope_theta": 5000000, + "use_cache": True, + "vocab_size": 151936, + }, + "tie_word_embeddings": False, + "transformers_version": "4.57.0.dev0", + "video_token_id": 151656, + "vision_config": { + "deepstack_visual_indexes": [8, 16, 24], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4304, + "model_type": "qwen3_vl_moe", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 2048, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "_attn_implementation": "flash_attention_2", + "_name_or_path": str(os.path.join(llm_models_root(), "Qwen3", "Qwen3-VL-30B-A3B-Instruct")), +} + + +@dataclass(repr=False) +class TestQwen3VLMoeScenario(MultimodalScenario): + disable_fuse_rope: bool = False + + def __repr__(self) -> str: + """Generate a human-readable string representation of the scenario.""" + features = [] + features.append(f"modality:{self.modality.lower()}") + if self.use_cuda_graph: + features.append("cuda_graph") + if self.disable_fuse_rope: + features.append("no_fuse_rope") + if self.chunked_prefill: + 
features.append("chunked_prefill") + if self.kv_cache_reuse: + features.append("kv_cache_reuse") + return "-".join(features) + + +class TestQwen3VLMoe(TestModelingMultimodal): + def get_model_config(self): + """Return the model configuration dictionary.""" + return QWEN3_VL_30B_A3B_CONFIG + + def get_trtllm_model_class(self): + return Qwen3MoeVLModel + + def get_hf_model_class(self): + return HFQwen3VLMoeForConditionalGeneration + + def get_weight_mapper_class(self): + return Qwen3VLMoeHfWeightMapper + + def get_model_type(self): + return "qwen3_vl_moe" + + def get_model_config_class(self): + return Qwen3VLMoeConfig + + def get_trtllm_inputs( + self, + input_ids, + multimodal_params_list, + is_gen: bool = False, + num_cached_tokens_per_seq: List[int] = None, + ): + trtllm_inputs = super().get_trtllm_inputs( + input_ids, multimodal_params_list, is_gen, num_cached_tokens_per_seq + ) + + if is_gen: + mrope_gen_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_gen_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_deltas"] + ) + mrope_gen_position_ids = torch.cat(mrope_gen_position_ids, dim=-1).to(self.device) + trtllm_inputs["position_ids"] = ( + (trtllm_inputs["position_ids"] + mrope_gen_position_ids).expand(3, -1, 1).cuda() + ) + gen_multimodal_params_list = [] + for multimodal_param in multimodal_params_list: + multimodal_param.strip_for_generation() + multimodal_param.to_device( + "multimodal_data", + self.device, + pin_memory=True, + target_keywords=["mrope_config.mrope_position_deltas"], + ) + gen_multimodal_params_list.append(multimodal_param) + trtllm_inputs["multimodal_params"] = gen_multimodal_params_list + else: + # Mrope position ids + mrope_position_ids = [] + for multimodal_param in multimodal_params_list: + mrope_position_ids.append( + multimodal_param.multimodal_data["mrope_config"]["mrope_position_ids"] + ) + position_ids = torch.cat(mrope_position_ids, dim=-1) + position_ids = 
position_ids.cuda() + trtllm_inputs["position_ids"] = position_ids + + return trtllm_inputs + + def init_kv_cache_manager(self, scenario: TestQwen3VLMoeScenario): + # NOTE: Exactly the same as the parent class method, + # but with the mrope flag set to True for Qwen2.5-VL model. + cache_config = self.get_kv_cache_config(scenario) + tokens_per_block = cache_config["tokens_per_block"] + max_seq_len = cache_config["max_seq_len"] + batch_size = cache_config["batch_size"] + + num_blocks = (max_seq_len + tokens_per_block - 1) // tokens_per_block + + self.kv_cache_manager = self.get_kv_cache_manager( + dtype=self.model_config.pretrained_config.torch_dtype, + config=self.model_config.pretrained_config, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + batch_size=batch_size, + num_blocks=num_blocks, + ) + + self.kv_cache_manager.add_dummy_requests( + request_ids=[1], + token_nums=[max_seq_len], + # NOTE: Qwen2.5-VL model uses mrope + use_mrope=True, + ) + + def run_trtllm_forward(self, trtllm_inputs, use_cuda_graph: bool = False): + # NOTE: Exactly the same as the parent class method, + # but with the mrope flag set to True for Qwen2.5-VL model. + if not use_cuda_graph: + trtllm_inputs["attn_metadata"].prepare() + return self.trtllm_model.forward(**trtllm_inputs) + else: + # NOTE: Qwen2.5-VL model uses mrope + graph_runner = create_mock_cuda_graph_runner(1, True) + trtllm_inputs["attn_metadata"] = trtllm_inputs[ + "attn_metadata" + ].create_cuda_graph_metadata(1) + + # Prepare metadata before capture (like in working Qwen2.5-VL test) + trtllm_inputs["attn_metadata"].prepare() + + key = (1, 0, False) + graph_runner.capture( + key=key, + forward_fn=lambda inputs: self.trtllm_model.forward(**inputs), + initial_inputs=trtllm_inputs, + ) + for _ in range(2): + # Run it twice. This helps us catch problems if buffers are accidentally reallocated in prepare(). 
+ trtllm_inputs["attn_metadata"].prepare() + logits = graph_runner.replay(key=key, current_inputs=trtllm_inputs) + return logits.clone() + + def get_scenarios(self) -> List[TestQwen3VLMoeScenario]: + scenarios = [ + # ==== Modality Sanity Checks ==== + TestQwen3VLMoeScenario( + modality="image", + use_cuda_graph=False, + disable_fuse_rope=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + TestQwen3VLMoeScenario( + modality="video", + use_cuda_graph=False, + disable_fuse_rope=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + TestQwen3VLMoeScenario( + modality="multiple_image", + use_cuda_graph=False, + disable_fuse_rope=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + # ==== CUDA Graph Scenarios ==== + TestQwen3VLMoeScenario( + modality="image", + use_cuda_graph=True, + disable_fuse_rope=False, + chunked_prefill=False, + kv_cache_reuse=False, + ), + # ==== Chunked Prefill Scenarios ==== + TestQwen3VLMoeScenario( + modality="image", + use_cuda_graph=False, + disable_fuse_rope=False, + chunked_prefill=True, + kv_cache_reuse=False, + ), + # ==== KV Cache Reuse Scenarios ==== + TestQwen3VLMoeScenario( + modality="image", + use_cuda_graph=False, + disable_fuse_rope=False, + chunked_prefill=False, + kv_cache_reuse=True, + ), + # ==== Disable fuse rope scenarios ==== + TestQwen3VLMoeScenario( + modality="image", + use_cuda_graph=False, + disable_fuse_rope=True, + chunked_prefill=False, + kv_cache_reuse=False, + ), + ] + return scenarios + + def setup_scenario(self, scenario: TestQwen3VLMoeScenario): + super().setup_scenario(scenario) + if scenario.disable_fuse_rope: + self.trtllm_model, self.model_config = self.create_trtllm_model( + load_weights=True, + hf_model_state_dict=self.hf_model.state_dict(), + disable_fuse_rope=True, + ) diff --git a/tests/unittest/_torch/modeling/test_modeling_siglip.py b/tests/unittest/_torch/modeling/test_modeling_siglip.py index de80efa1f4..40a7dd1399 100644 --- 
a/tests/unittest/_torch/modeling/test_modeling_siglip.py +++ b/tests/unittest/_torch/modeling/test_modeling_siglip.py @@ -106,7 +106,8 @@ class TestSiglipVisionModel(unittest.TestCase): attn_backend=backend, ) - tllm_model = SiglipVisionModel(model_config).to(dtype).to(device) + tllm_model = SiglipVisionModel( + model_config, use_post_layernorm=True).to(dtype).to(device) tllm_model.load_weights(hf_model.state_dict()) # Prepare inputs - create random pixel values for images diff --git a/triton_backend/requirements.txt b/triton_backend/requirements.txt index 5057b551f1..7daa868ed4 100644 --- a/triton_backend/requirements.txt +++ b/triton_backend/requirements.txt @@ -1,7 +1,8 @@ regex fire tritonclient[all] -transformers==4.56.0 +transformers==4.57.1 pandas tabulate flash_attn +torchao>=0.14.1 From ce7a42f4cf5bac4afea56f417d20f501bec571cd Mon Sep 17 00:00:00 2001 From: Eran Geva <19514940+MrGeva@users.noreply.github.com> Date: Tue, 16 Dec 2025 06:30:24 +0200 Subject: [PATCH 163/172] [https://nvbugs/5731717][fix] fixed flashinfer build race condition during test (#9983) Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com> --- tests/integration/defs/test_unittests.py | 16 ++++- .../multigpu/test_ad_allreduce_strategies.py | 68 +++++++++++++++++++ .../test_allreduce_residual_rmsnorm_fusion.py | 3 + 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/test_unittests.py b/tests/integration/defs/test_unittests.py index 4df5deb539..190ea5111e 100644 --- a/tests/integration/defs/test_unittests.py +++ b/tests/integration/defs/test_unittests.py @@ -125,7 +125,7 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request): f'results-sub-unittests-{case_fn}.xml') command = [ - '-m', 'pytest', ignore_opt, "-v", "--timeout=2400", + '-m', 'pytest', ignore_opt, "-v", "--tb=short", "-rF", "--timeout=2400", "--timeout-method=thread" ] if test_prefix: @@ -153,7 +153,19 @@ def test_unittests_v2(llm_root, llm_venv, case: str, 
output_dir, request): cwd=test_root, env=env, ) - except CalledProcessError: + except CalledProcessError as e: + print(f"\n{'='*60}") + print(f"UNITTEST FAILED with exit code: {e.returncode}") + print(f"Command: {' '.join(cmd)}") + if hasattr(e, 'stdout') and e.stdout: + print( + f"STDOUT:\n{e.stdout.decode() if isinstance(e.stdout, bytes) else e.stdout}" + ) + if hasattr(e, 'stderr') and e.stderr: + print( + f"STDERR:\n{e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr}" + ) + print(f"{'='*60}\n") return False return True diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py index 0c386330af..9d4e444e4d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py @@ -23,6 +23,9 @@ from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op from tensorrt_llm.commands.bench import main from tensorrt_llm.functional import AllReduceStrategy +# needed since LLM API uses MPI executor pool internally for TP>1, which leaks a thread on shutdown +pytestmark = pytest.mark.threadleak(enabled=False) + class TimeoutError(Exception): """Exception raised when a test times out.""" @@ -55,6 +58,71 @@ def timeout(seconds): signal.signal(signal.SIGALRM, old_handler) +@pytest.fixture(scope="module", autouse=True) +def prewarm_flashinfer_jit(): + """Pre-warm FlashInfer JIT kernels before multi-GPU tests. + + This prevents a race condition where multiple MPI ranks try to JIT-compile + FlashInfer kernels simultaneously to the same cache directory, causing + Ninja build failures like: "ninja: error: opening build log: No such file or directory" + + By triggering the compilation in the main process first, the kernels are + cached and available for all worker ranks. 
+ """ + try: + import flashinfer + import flashinfer.page + import flashinfer.sampling + + if torch.cuda.is_available(): + # Prevent concurrent JIT warmup across multiple pytest processes (e.g., xdist). + try: + import fcntl # Linux-only + except ImportError: + fcntl = None + + lock_f = None + if fcntl is not None: + import pathlib + import tempfile + + lock_path = pathlib.Path(tempfile.gettempdir()) / "flashinfer_jit_prewarm.lock" + lock_f = open(lock_path, "w") + fcntl.flock(lock_f.fileno(), fcntl.LOCK_EX) + # Create dummy tensors to trigger kernel JIT compilation + with torch.no_grad(): + device = torch.device("cuda:0") + + # Trigger page kernel compilation + try: + # Force module loading (this triggers JIT compilation) + _ = flashinfer.page.gen_page_module() + except Exception as exc: # noqa: BLE001 + import warnings + + warnings.warn(f"FlashInfer page-kernel prewarm failed: {exc!r}", RuntimeWarning) + + # Trigger sampling kernel compilation + try: + dummy_probs = torch.softmax(torch.randn(1, 100, device=device), dim=-1) + _ = flashinfer.sampling.sampling_from_probs(dummy_probs, deterministic=True) + except Exception as exc: # noqa: BLE001 + import warnings + + warnings.warn( + f"FlashInfer sampling-kernel prewarm failed: {exc!r}", RuntimeWarning + ) + + torch.cuda.empty_cache() + if lock_f is not None: + lock_f.close() + + except ImportError: + pass # FlashInfer not available + + yield + + @pytest.fixture(scope="module") def shared_dataset(llm_root): # noqa: F811 """Prepare dataset once for all tests in this module.""" diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py index 408601bc68..f9595cde7f 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py +++ 
b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py @@ -12,6 +12,9 @@ from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimiz from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op from tensorrt_llm.llmapi.mpi_session import MpiPoolSession +# needed since MPI executor pool leaks a thread (_manager_spawn) on shutdown +pytestmark = pytest.mark.threadleak(enabled=False) + class RMSNorm(torch.nn.Module): """Implementation of LlamaRMSNorm.""" From 8af51211c1e9a3917aeb92f1e249234a11f3fbd5 Mon Sep 17 00:00:00 2001 From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Date: Tue, 16 Dec 2025 12:41:17 +0800 Subject: [PATCH 164/172] [FMDL-1222][feat] Support weight and weight_scale padding for NVFP4 MoE cutlass (#9358) Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> --- .../_torch/modules/fused_moe/interface.py | 4 + .../_torch/modules/fused_moe/quantization.py | 221 ++++++++++++++---- .../unittest/_torch/modules/test_fused_moe.py | 122 ++++++++++ 3 files changed, 298 insertions(+), 49 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py index ca1e134bf9..e415d0cc1b 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/interface.py +++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py @@ -719,6 +719,10 @@ class MoE(nn.Module): """ return False + @property + def expand_intermediate_size_per_partition(self): + return self.intermediate_size_per_partition * self.intermediate_size_expand_ratio + def reducescatter_or_allreduce( self, inputs, diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 36175e5212..55de1a7e5d 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -219,9 +219,9 @@ class FusedMoEMethodBase(ABC): # 
bias if module.bias: if w3_w1_bias_shape is None: - w3_w1_bias_shape = (module.expert_size_per_partition, - module.intermediate_size_per_partition * - module.intermediate_size_expand_ratio) + w3_w1_bias_shape = ( + module.expert_size_per_partition, + module.expand_intermediate_size_per_partition) if w2_bias_shape is None: w2_bias_shape = (module.expert_size_per_partition, module.hidden_size) @@ -515,8 +515,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase): def create_weights(self, module: torch.nn.Module): weight_dtype = module.dtype w3_w1_weight_shape = (module.expert_size_per_partition, - module.intermediate_size_per_partition * - module.intermediate_size_expand_ratio, + module.expand_intermediate_size_per_partition, module.hidden_size) w2_weight_shape = ( module.expert_size_per_partition, @@ -581,7 +580,7 @@ def requantize_expert_w3_w1_weight_fp8_qdq(module: torch.nn.Module, w3_weight_scale = w3_weight_scale[...].reshape([]) max_w3_w1_weight_scale = max(w1_weight_scale, w3_weight_scale) - split_length = module.intermediate_size_per_partition * module.intermediate_size_expand_ratio // 2 + split_length = module.expand_intermediate_size_per_partition // 2 w3_weight = dst_w3_w1_weight.narrow( dim=0, start=0, length=split_length).to(dtype=module.dtype) w1_weight = dst_w3_w1_weight.narrow( @@ -605,8 +604,7 @@ class FP8QDQFusedMoEMethod(FusedMoEMethodBase): weight_dtype = torch.float8_e4m3fn w3_w1_weight_shape = (module.expert_size_per_partition, - module.intermediate_size_per_partition * - module.intermediate_size_expand_ratio, + module.expand_intermediate_size_per_partition, module.hidden_size) w2_weight_shape = ( module.expert_size_per_partition, @@ -1655,6 +1653,38 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase): Base class for NVFP4 fused MoE methods for all backends. 
""" + def get_weights_shapes(self, module: torch.nn.Module, weight_vec_size: int, + block_scales_vec_size: int): + # Divide by 16 because we use int64 to pack 16 fp4 values + w3_w1_weight_shape = (module.expert_size_per_partition, + module.expand_intermediate_size_per_partition, + module.hidden_size // weight_vec_size) + w2_weight_shape = (module.expert_size_per_partition, module.hidden_size, + module.intermediate_size_per_partition // + weight_vec_size) + + w3_w1_weight_scale_shape = ( + module.expert_size_per_partition, + module.expand_intermediate_size_per_partition, module.hidden_size // + module.scaling_vector_size // block_scales_vec_size) + w2_weight_scale_shape = (module.expert_size_per_partition, + module.hidden_size, + module.intermediate_size_per_partition // + module.scaling_vector_size // + block_scales_vec_size) + + if module.bias: + w3_w1_bias_shape = (module.expert_size_per_partition, + module.expand_intermediate_size_per_partition) + w2_bias_shape = (module.expert_size_per_partition, + module.hidden_size) + else: + w3_w1_bias_shape = None + w2_bias_shape = None + + return (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape, + w2_bias_shape, w3_w1_weight_scale_shape, w2_weight_scale_shape) + def create_weights(self, module: torch.nn.Module, weight_dtype, @@ -1664,35 +1694,23 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase): scaling_vector_size=16): module.scaling_vector_size = scaling_vector_size - # Divide by 16 because we use int64 to pack 16 fp4 values - w3_w1_weight_shape = (module.expert_size_per_partition, - module.intermediate_size_per_partition * - module.intermediate_size_expand_ratio, - module.hidden_size // weight_vec_size) - w2_weight_shape = (module.expert_size_per_partition, module.hidden_size, - module.intermediate_size_per_partition // - weight_vec_size) + + (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape, w2_bias_shape, + w3_w1_weight_scale_shape, + w2_weight_scale_shape) = self.get_weights_shapes( + module, 
weight_vec_size, block_scales_vec_size) # Divide by 4 because we use int32 to pack 4 fp8 values # column parallel - w3_w1_weight_scale = nn.Parameter( - torch.ones(module.expert_size_per_partition, - module.intermediate_size_per_partition * - module.intermediate_size_expand_ratio, - module.hidden_size // module.scaling_vector_size // - block_scales_vec_size, - dtype=block_scales_dtype), - requires_grad=False) + w3_w1_weight_scale = nn.Parameter(torch.ones(w3_w1_weight_scale_shape, + dtype=block_scales_dtype), + requires_grad=False) module.register_parameter("w3_w1_weight_scale", w3_w1_weight_scale) # row parallel - w2_weight_scale = nn.Parameter( - torch.ones(module.expert_size_per_partition, - module.hidden_size, - module.intermediate_size_per_partition // - module.scaling_vector_size // block_scales_vec_size, - dtype=block_scales_dtype), - requires_grad=False) + w2_weight_scale = nn.Parameter(torch.ones(w2_weight_scale_shape, + dtype=block_scales_dtype), + requires_grad=False) module.register_parameter("w2_weight_scale", w2_weight_scale) fc31_input_scale = nn.Parameter(torch.tensor(1., dtype=torch.float32), @@ -1717,8 +1735,12 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase): # This will be initialized in load_quant_scales if pre_quant_scale exists module.register_parameter("fc31_act_scale", None) - super().create_weights(module, weight_dtype, w3_w1_weight_shape, - w2_weight_shape) + super().create_weights(module, + weight_dtype, + w3_w1_weight_shape=w3_w1_weight_shape, + w2_weight_shape=w2_weight_shape, + w3_w1_bias_shape=w3_w1_bias_shape, + w2_bias_shape=w2_bias_shape) self.setup_quant_scales(module) @@ -2005,6 +2027,55 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase): class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod): weight_dtype = FUSED_MOE_NVFP4_WEIGHT_DTYPE block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE + NVFP4_ROW_ALIGNMENT = 128 + NVFP4_COL_ALIGNMENT = 4 + + def get_weights_shapes(self, module: torch.nn.Module, weight_vec_size: int, + 
block_scales_vec_size: int): + """Override the base method to get aligned weights shapes for Cutlass nvfp4 alignment.""" + intermediate_size_expand_aligned = ( + module.expand_intermediate_size_per_partition + + self.NVFP4_ROW_ALIGNMENT - + 1) // self.NVFP4_ROW_ALIGNMENT * self.NVFP4_ROW_ALIGNMENT + + if module.hidden_size % self.NVFP4_COL_ALIGNMENT != 0: + raise ValueError( + f"hidden_size {module.hidden_size} must be divisible by {self.NVFP4_COL_ALIGNMENT}" + ) + hidden_size_aligned = module.hidden_size + + w3_w1_weight_shape = (module.expert_size_per_partition, + intermediate_size_expand_aligned, + hidden_size_aligned // weight_vec_size) + w2_weight_shape = (module.expert_size_per_partition, + hidden_size_aligned, + intermediate_size_expand_aligned // + module.intermediate_size_expand_ratio // + weight_vec_size) + + w3_w1_weight_scale_shape = (module.expert_size_per_partition, + intermediate_size_expand_aligned, + hidden_size_aligned // + module.scaling_vector_size // + block_scales_vec_size) + w2_weight_scale_shape = (module.expert_size_per_partition, + hidden_size_aligned, + intermediate_size_expand_aligned // + module.intermediate_size_expand_ratio // + module.scaling_vector_size // + block_scales_vec_size) + + if module.bias: + w3_w1_bias_shape = (module.expert_size_per_partition, + intermediate_size_expand_aligned) + w2_bias_shape = (module.expert_size_per_partition, + hidden_size_aligned) + else: + w3_w1_bias_shape = None + w2_bias_shape = None + + return (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape, + w2_bias_shape, w3_w1_weight_scale_shape, w2_weight_scale_shape) def create_weights(self, module: torch.nn.Module): weight_vec_size = torch.iinfo(self.weight_dtype).bits // 4 @@ -2029,21 +2100,16 @@ class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod): module.tp_rank, TensorParallelMode.COLUMN, device=device) - # Keep weights in device buffer - # w3 - split_length = module.intermediate_size_per_partition * module.intermediate_size_expand_ratio // 2 
- dst_w3_weight_scale = dst_w3_w1_weight_scale.narrow(dim=0, - start=0, - length=split_length) - dst_w3_weight_scale.copy_( - w3_weight_scale.view(dst_w3_weight_scale.dtype)) - # w1 - dst_w1_weight_scale = dst_w3_w1_weight_scale.narrow(dim=0, - start=split_length, - length=split_length) - dst_w1_weight_scale.copy_( - w1_weight_scale.view(dst_w1_weight_scale.dtype)) + cast_w3_weight_scale = w3_weight_scale.view( + dst_w3_w1_weight_scale.dtype) + cast_w1_weight_scale = w1_weight_scale.view( + dst_w3_w1_weight_scale.dtype) + cast_w31_weight_scale = torch.cat( + [cast_w3_weight_scale, cast_w1_weight_scale], dim=0) + cast_w31_weight_scale = self._maybe_padding_shape( + cast_w31_weight_scale, dst_w3_w1_weight_scale) + dst_w3_w1_weight_scale.copy_(cast_w31_weight_scale) orig_shape = dst_w3_w1_weight_scale.shape @@ -2065,9 +2131,12 @@ class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod): module.tp_rank, TensorParallelMode.ROW, device=device) + + cast_w2_weight_scale = w2_weight_scale.view(dst_w2_weight_scale.dtype) + cast_w2_weight_scale = self._maybe_padding_shape( + cast_w2_weight_scale, dst_w2_weight_scale) # Keep weights in device buffer - dst_w2_weight_scale.copy_( - w2_weight_scale.view(dst_w2_weight_scale.dtype)) + dst_w2_weight_scale.copy_(cast_w2_weight_scale) orig_shape = dst_w2_weight_scale.shape @@ -2079,6 +2148,60 @@ class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod): dst_w2_weight_scale.copy_(dst_w2_weight_scale_interleaved) + def load_expert_w3_w1_weight(self, module: torch.nn.Module, + w1_weight: torch.Tensor, + w3_weight: torch.Tensor, + dst_w3_w1_weight: torch.Tensor): + """Load and pad w1 and w3 weights for each expert, to match shape requirements for Cutlass nvfp4 alignment.""" + device = dst_w3_w1_weight.device + w1_weight_shard = load_weight_shard(w1_weight, + module.tp_size, + module.tp_rank, + TensorParallelMode.COLUMN, + device=device) + w3_weight_shard = load_weight_shard(w3_weight, + module.tp_size, + module.tp_rank, + 
TensorParallelMode.COLUMN, + device=device) + + cast_w1_weight_shard = w1_weight_shard.view(dst_w3_w1_weight.dtype) + cast_w3_weight_shard = w3_weight_shard.view(dst_w3_w1_weight.dtype) + cast_w31_weight_shard = torch.cat( + [cast_w3_weight_shard, cast_w1_weight_shard], dim=0) + cast_w31_weight_shard = self._maybe_padding_shape( + cast_w31_weight_shard, dst_w3_w1_weight) + dst_w3_w1_weight.copy_(cast_w31_weight_shard, non_blocking=True) + + def load_expert_w2_weight(self, module: torch.nn.Module, + w2_weight: torch.Tensor, + dst_w2_weight: torch.Tensor): + """Load and pad w2 weight for each expert, to match shape requirements for Cutlass nvfp4 alignment.""" + device = dst_w2_weight.device + w2_weight_shard = load_weight_shard(w2_weight, + module.tp_size, + module.tp_rank, + TensorParallelMode.ROW, + device=device) + cast_w2_weight_shard = w2_weight_shard.view(dst_w2_weight.dtype) + cast_w2_weight_shard = self._maybe_padding_shape( + cast_w2_weight_shard, dst_w2_weight) + dst_w2_weight.copy_(cast_w2_weight_shard, non_blocking=True) + + def _maybe_padding_shape(self, source_tensor, dst_tensor): + """Pad the source tensor to match the shape of the destination tensor.""" + # In `get_weights_shapes` method, the shape of `weights` and `weight_scales` might be tuned to align with `NVFP4_ROW_ALIGNMENT`. + # Padding the `source_tensor` to match the shape of `dst_tensor` here. + assert len(source_tensor.shape) == 2 and len( + dst_tensor.shape) == 2, "Only support 2D weights padding for now." 
+ dst_row, dst_col = dst_tensor.shape + _row, _col = source_tensor.shape + if _row != dst_row or _col != dst_col: + source_tensor = torch.nn.functional.pad( + source_tensor, (0, dst_col - _col, 0, dst_row - _row), + "constant", 0).contiguous() + return source_tensor + class NVFP4CuteDslFusedMoEMethod(NVFP4CutlassFusedMoEMethod): diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index c66b136910..14210fb9a1 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -37,6 +37,8 @@ from tensorrt_llm._torch.modules.fused_moe import ( BaseMoeRoutingMethod, CutlassFusedMoE, TRTLLMGenFusedMoE, DefaultMoeRoutingMethod, RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE) +from tensorrt_llm._torch.modules.fused_moe.quantization import \ + NVFP4CutlassFusedMoEMethod # isort: on from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ IS_TRITON_KERNELS_AVAILABLE @@ -2792,3 +2794,123 @@ class RefGatedMLPFusedMoE(nn.Module): self.experts[expert].gate_up_proj.load_weights(gate_up_proj_weights) self.experts[expert].down_proj.load_weights(down_proj_weights) + + +# Create a mock module with required attributes for NVFP4CutlassFusedMoEMethod.get_weights_shapes test. +class MockModule: + + def __init__(self, hidden_size, intermediate_size, expand_ratio, + expert_size, bias): + self.hidden_size = hidden_size + self.intermediate_size_per_partition = intermediate_size + self.intermediate_size_expand_ratio = expand_ratio + self.expand_intermediate_size_per_partition = intermediate_size * self.intermediate_size_expand_ratio + self.expert_size_per_partition = expert_size + self.bias = bias + # Constants for NVFP4. 
+ self.scaling_vector_size = 16 # Standard for NVFP4 + self.weight_vec_size = 16 # 16 fp4 values packed into int64 + self.block_scales_vec_size = 4 # 4 fp8 values packed into int32 + + +def test_nvfp4_cutlass_get_weights_shapes_error_cases(): + """Test NVFP4CutlassFusedMoEMethod.get_weights_shapes for error cases.""" + method = NVFP4CutlassFusedMoEMethod() + module = MockModule(hidden_size=13, + intermediate_size=16, + expand_ratio=1, + expert_size=4, + bias=False) + with pytest.raises(ValueError, + match="hidden_size 13 must be divisible by 4"): + method.get_weights_shapes(module, module.weight_vec_size, + module.block_scales_vec_size) + + +@pytest.mark.parametrize( + "hidden_size, intermediate_size, expand_ratio, expert_size, bias", [ + (512, 1024, 1, 32, True), + (512, 1024, 2, 32, True), + (256, 512, 1, 16, False), + (256, 512, 2, 16, False), + (128, 120, 1, 8, False), + (128, 120, 2, 8, False), + (128, 120, 1, 8, True), + (128, 120, 2, 8, True), + ]) +def test_nvfp4_cutlass_get_weights_shapes(hidden_size, intermediate_size, + expand_ratio, expert_size, bias): + """Test NVFP4CutlassFusedMoEMethod.get_weights_shapes for alignment requirements.""" + module = MockModule(hidden_size=hidden_size, + intermediate_size=intermediate_size, + expand_ratio=expand_ratio, + expert_size=expert_size, + bias=bias) + method = NVFP4CutlassFusedMoEMethod() + NVFP4_ROW_ALIGNMENT = method.NVFP4_ROW_ALIGNMENT + + # Get weight shapes + (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape, w2_bias_shape, + w3_w1_weight_scale_shape, + w2_weight_scale_shape) = method.get_weights_shapes( + module, module.weight_vec_size, module.block_scales_vec_size) + + # Calculate expected aligned sizes + intermediate_size_expand = intermediate_size * module.intermediate_size_expand_ratio + intermediate_size_expand_aligned = ( + (intermediate_size_expand + NVFP4_ROW_ALIGNMENT - 1) // + NVFP4_ROW_ALIGNMENT * NVFP4_ROW_ALIGNMENT) + hidden_size_aligned = hidden_size + + expected_w3_w1_weight_shape = 
(expert_size, + intermediate_size_expand_aligned, + hidden_size_aligned // + module.weight_vec_size) + assert w3_w1_weight_shape == expected_w3_w1_weight_shape, ( + f"w3_w1_weight_shape mismatch: got {w3_w1_weight_shape}, " + f"expected {expected_w3_w1_weight_shape}") + + expected_w2_weight_shape = (expert_size, hidden_size_aligned, + intermediate_size_expand_aligned // + module.intermediate_size_expand_ratio // + module.weight_vec_size) + assert w2_weight_shape == expected_w2_weight_shape, ( + f"w2_weight_shape mismatch: got {w2_weight_shape}, " + f"expected {expected_w2_weight_shape}") + + expected_w3_w1_weight_scale_shape = (expert_size, + intermediate_size_expand_aligned, + hidden_size_aligned // + module.scaling_vector_size // + module.block_scales_vec_size) + assert w3_w1_weight_scale_shape == expected_w3_w1_weight_scale_shape, ( + f"w3_w1_weight_scale_shape mismatch: got {w3_w1_weight_scale_shape}, " + f"expected {expected_w3_w1_weight_scale_shape}") + + expected_w2_weight_scale_shape = (expert_size, hidden_size_aligned, + intermediate_size_expand_aligned // + module.intermediate_size_expand_ratio // + module.scaling_vector_size // + module.block_scales_vec_size) + assert w2_weight_scale_shape == expected_w2_weight_scale_shape, ( + f"w2_weight_scale_shape mismatch: got {w2_weight_scale_shape}, " + f"expected {expected_w2_weight_scale_shape}") + + # Verify bias shapes + if bias: + expected_w3_w1_bias_shape = (expert_size, + intermediate_size_expand_aligned) + expected_w2_bias_shape = (expert_size, hidden_size_aligned) + assert w3_w1_bias_shape == expected_w3_w1_bias_shape, ( + f"w3_w1_bias_shape mismatch: got {w3_w1_bias_shape}, " + f"expected {expected_w3_w1_bias_shape}") + assert w2_bias_shape == expected_w2_bias_shape, ( + f"w2_bias_shape mismatch: got {w2_bias_shape}, " + f"expected {expected_w2_bias_shape}") + else: + assert w3_w1_bias_shape is None, f"Expected None for w3_w1_bias_shape, got {w3_w1_bias_shape}" + assert w2_bias_shape is None, f"Expected 
None for w2_bias_shape, got {w2_bias_shape}" + + assert intermediate_size_expand_aligned % NVFP4_ROW_ALIGNMENT == 0, ( + f"intermediate_size_expand_aligned {intermediate_size_expand_aligned} " + f"not aligned to {NVFP4_ROW_ALIGNMENT}") From 6b5ebaae3ebccec4fbbb79290c48e6ee477e6e29 Mon Sep 17 00:00:00 2001 From: Yihan Wang Date: Tue, 16 Dec 2025 13:15:25 +0800 Subject: [PATCH 165/172] [None][chore] Update internal_cutlass_kernels artifacts (#9992) Signed-off-by: Yihan Wang --- .../tensorrt_llm_internal_cutlass_kernels_static.tar.xz | 4 ++-- .../internal_cutlass_kernels/aarch64-linux-gnu/version.txt | 4 ++-- .../tensorrt_llm_internal_cutlass_kernels_static.tar.xz | 4 ++-- .../internal_cutlass_kernels/x86_64-linux-gnu/version.txt | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index ac28ba8f9f..7c7ced1061 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a345d90233d94c0b3f6b9f5c6e79152852354e174f0edd68f00c2554e9e32b5 -size 67111548 +oid sha256:35e57babe61b004d3b5cd9b3f27c28082c41299bafed1436c34060f95d457ae2 +size 67079084 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index eb6005bb71..c93a045165 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 
-389ecc2585d407dcf336cfb5d1fdf7cdf77922998b0560743c5b162172fa57c1 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 9fc66c405c7caaaeb65542ba1498f00d863f0a4a +843e77cd5a31b18f3238118d467e0c985901bce4f48476916c643083fb7ee062 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 5a8266adf797b8e01be54ecf24d0b42aacd894c9 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 935aabe42d..e8e22e9ffd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d74cbe0df4f798fbc0c157280ebcc734ad6d1897ba3b43026e4aa22a2a4480a5 -size 66904288 +oid sha256:1f9af5e75bb37073d349889a4c0ad5ea8e4a4d5bbacad79a21913018dd851052 +size 66891640 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 4194d5219e..9a1851f2a5 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -a396f947f273fc752469160c9ae83caf393017d096cf4881ee09ad6af64296e1 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 9fc66c405c7caaaeb65542ba1498f00d863f0a4a +f614620abbdf34285b3a41a151d0efd3a02e455b557357616204f4980e53f8ab libtensorrt_llm_internal_cutlass_kernels_static.a +commit 5a8266adf797b8e01be54ecf24d0b42aacd894c9 From 28b02b4f5ac68a4d06a6a543990b29b9353d9d17 Mon Sep 17 00:00:00 2001 From: William Zhang <133824995+2ez4bz@users.noreply.github.com> Date: Mon, 15 Dec 2025 
22:17:24 -0800 Subject: [PATCH 166/172] [None][docs] Add README for Nemotron Nano v3 (#10017) Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com> Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Co-authored-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> --- .../features/auto_deploy/support_matrix.md | 2 + docs/source/models/supported-models.md | 1 + .../torch/auto_deploy/support_matrix.md | 2 + .../core/nemotron/README_nemotron_nano_v3.md | 194 ++++++++++++++++++ 4 files changed, 199 insertions(+) create mode 100644 examples/models/core/nemotron/README_nemotron_nano_v3.md diff --git a/docs/source/features/auto_deploy/support_matrix.md b/docs/source/features/auto_deploy/support_matrix.md index fec6d841af..9c9d56bea6 100644 --- a/docs/source/features/auto_deploy/support_matrix.md +++ b/docs/source/features/auto_deploy/support_matrix.md @@ -84,6 +84,8 @@ In addition, the following models have been officially validated using the defau - nvidia/Llama-3_3-Nemotron-Super-49B-v1 - nvidia/Mistral-NeMo-Minitron-8B-Base - nvidia/Nemotron-Flash-3B-Instruct +- nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +- nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 - perplexity-ai/r1-1776-distill-llama-70b diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 40f3840073..d4ada87f58 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -18,6 +18,7 @@ The following is a table of supported models for the PyTorch backend: | `MixtralForCausalLM` | Mixtral | `mistralai/Mixtral-8x7B-v0.1` | | `MllamaForConditionalGeneration` | Llama 3.2 | `meta-llama/Llama-3.2-11B-Vision` | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base` | +| `NemotronHForCausalLM` | Nemotron-3-Nano | `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8` | | `NemotronNASForCausalLM` | NemotronNAS | `nvidia/Llama-3_3-Nemotron-Super-49B-v1` | | 
`Phi3ForCausalLM` | Phi-4 | `microsoft/Phi-4` | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/Qwen2-7B-Instruct` | diff --git a/docs/source/torch/auto_deploy/support_matrix.md b/docs/source/torch/auto_deploy/support_matrix.md index f0158253dd..037585461d 100644 --- a/docs/source/torch/auto_deploy/support_matrix.md +++ b/docs/source/torch/auto_deploy/support_matrix.md @@ -83,6 +83,8 @@ In addition, the following models have been officially validated using the defau - nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8 - nvidia/Llama-3_3-Nemotron-Super-49B-v1 - nvidia/Mistral-NeMo-Minitron-8B-Base +- nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 +- nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 - perplexity-ai/r1-1776-distill-llama-70b diff --git a/examples/models/core/nemotron/README_nemotron_nano_v3.md b/examples/models/core/nemotron/README_nemotron_nano_v3.md new file mode 100644 index 0000000000..dac512f47e --- /dev/null +++ b/examples/models/core/nemotron/README_nemotron_nano_v3.md @@ -0,0 +1,194 @@ +# Nemotron Nano V3 model + +## Overview + +The Nemotron Nano V3 model uses a hybrid Mamba-Transformer MoE architecture and supports a 1M +token context length. This enables developers to build reliable, high-throughput agents across +complex, multi-document, and long-duration applications. + +This document outlines the procedures for executing Nemotron Nano V3 using TensorRT LLM. The +implementation supports both single and multi-GPU configurations via the AutoDeploy backend. +Additionally, ModelOpt was employed to derive FP8 and NVFP4 checkpoints from the source checkpoint. +The model repositories are: +* [BF16 repository](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) +* [FP8 repository](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8) + +Nemotron Nano V3 supports the following features: +* BF16, FP8 with KV cache FP8, NVFP4 model formats. +* Single and multi-GPU inference. +* Support 1M token context with long context/generation sequences. 
+ +# Usage + +## Online serving example + +We can follow the configuration file from [nano_v3.yaml](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/auto_deploy/nano_v3.yaml). + +For the server: + +```sh +# Example configuration: +cat > nano_v3.yaml< \ +--host 0.0.0.0 \ +--port 8000 \ +--backend _autodeploy \ +--trust_remote_code \ +--extra_llm_api_options nano_v3.yaml + +# OR you can launch trtllm-serve to support reasoning content parsing. +TRTLLM_ENABLE_PDL=1 trtllm-serve \ +--host 0.0.0.0 \ +--port 8000 \ +--backend _autodeploy \ +--trust_remote_code \ +--reasoning_parser nano-v3 \ +--extra_llm_api_options nano_v3.yaml + +# OR you can launch trtllm-serve to support tool-calling. +TRTLLM_ENABLE_PDL=1 trtllm-serve \ +--host 0.0.0.0 \ +--port 8000 \ +--backend _autodeploy \ +--trust_remote_code \ +--reasoning_parser nano-v3 \ +--tool_parser qwen3_coder \ +--extra_llm_api_options nano_v3.yaml +``` + +For the client: + +```sh +# Simple query example from client. +curl -X 'POST' 'http://0.0.0.0:8000/v1/chat/completions' \ +-H 'accept: application/json' \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "messages": [ + { + "role":"user", + "content": [ + { + "type": "text", + "text": "Hello, my name is" + } + ] + } + ], + "max_tokens": 128, + "temperature": 0 + }' | jq + +# Simple query example (with reasoning disabled) +curl -X 'POST' 'http://0.0.0.0:8000/v1/chat/completions' \ +-H 'accept: application/json' \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "messages": [ + { + "role":"user", + "content": [ + { + "type": "text", + "text": "Hello, my name is" + } + ] + } + ], + "max_tokens": 128, + "temperature": 0, + "chat_template_kwargs": {"enable_thinking": false} + }' | jq +``` + +## Offline inference example + +```sh +python examples/auto_deploy/build_and_run_ad.py --model --args.compile_backend torch-cudagraph +``` + +**More verbose offline
inference example**: + +Use a yaml: + +```sh +cat > nano_v3_offline.yaml< Date: Mon, 15 Dec 2025 23:38:29 -0800 Subject: [PATCH 167/172] [None][infra] Fixing credential loading in lockfile generation pipeline (#10020) Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com> --- jenkins/GenerateLock.groovy | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/jenkins/GenerateLock.groovy b/jenkins/GenerateLock.groovy index d700a062b1..b389701029 100644 --- a/jenkins/GenerateLock.groovy +++ b/jenkins/GenerateLock.groovy @@ -38,14 +38,6 @@ def createKubernetesPodConfig() return podConfig } -def getGitCredentialId (String repoUrlKey) { - if (repoUrlKey == "tensorrt_llm_internal") { - return 'svc_tensorrt_gitlab_api_token_no_username_as_string' - } else { - return 'github-cred-trtllm-ci' - } -} - def generate() { sh "pwd && ls -alh" @@ -63,7 +55,6 @@ def generate() } LLM_REPO = params.customRepoUrl } - def CREDENTIAL_ID = getGitCredentialId(params.repoUrlKey) sh "apt update" sh "apt install -y python3-dev git curl git-lfs" sh "git config --global --add safe.directory ${env.WORKSPACE}" @@ -83,8 +74,20 @@ def generate() sh "git status" sh "git add \$(find . 
-type f \\( -name 'poetry.lock' -o -name 'pyproject.toml' -o -name 'metadata.json' \\))" sh "git commit -s -m \"[None][infra] Check in most recent lock file from nightly pipeline\"" - withCredentials([string(credentialsId: CREDENTIAL_ID, variable: 'API_TOKEN')]) { - def authedUrl = LLM_REPO.replaceFirst('https://', "https://svc_tensorrt:${API_TOKEN}@") + withCredentials([ + string(credentialsId: 'svc_tensorrt_gitlab_api_token_no_username_as_string', variable: 'GITLAB_API_TOKEN'), + usernamePassword( + credentialsId: 'github-cred-trtllm-ci', + usernameVariable: 'NOT_IN_USE', + passwordVariable: 'GITHUB_API_TOKEN' + ) + ]) { + def authedUrl + if (params.repoUrlKey == "tensorrt_llm_internal") { + authedUrl = LLM_REPO.replaceFirst('https://', "https://svc_tensorrt:${GITLAB_API_TOKEN}@") + } else { + authedUrl = LLM_REPO.replaceFirst('https://', "https://svc_tensorrt:${GITHUB_API_TOKEN}@") + } sh "git remote set-url origin ${authedUrl}" sh "git fetch origin ${params.branchName}" sh "git status" From 064b67e40cda7eface441f34a678160faf356229 Mon Sep 17 00:00:00 2001 From: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> Date: Tue, 16 Dec 2025 16:34:37 +0800 Subject: [PATCH 168/172] [https://nvbugs/5727952][fix] a pdl bug in trtllm-gen fmha kernels (#9913) Signed-off-by: Perkz Zheng <67892460+PerkzZheng@users.noreply.github.com> --- .pre-commit-config.yaml | 2 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- 
...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- 
...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- 
...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- 
...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- .../fmha/cubin/kernelMetaInfo.h | 5855 ++++++++++++++++- .../fmha/cubin/kernelMetaInfo_cubin.cpp | 3 - .../trtllmGenKernels/fmha/fmhaKernels.h | 4 +- cpp/tensorrt_llm/kernels/xqaDispatcher.cpp | 11 +- 1949 files changed, 9747 insertions(+), 3904 deletions(-) delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c15d2ac081..b9dd903c6c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1395,6 +1395,8 @@ repos: - id: check-symlinks - id: detect-private-key - id: end-of-file-fixer + exclude: | + (?x)^(.*cubin.cpp | .*cubin.h)$ - id: check-yaml args: [--allow-multiple-documents] exclude: ".*/gitlab/.*.yml" diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 164af32324..d90bab0c1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:93da3db51dc2c74c43549079c18247be8c423165c425be80c9b4171624c8a1d7 -size 646191 +oid sha256:0afc687e183286972166696ff33bedbe8dfa8bc6bdf3213d25ac909e5f3040a9 +size 609323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 42a02cf931..3c021b27d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d49a0d4280f06012123d03c1f19b2c21eb0884b4621530741c23255c5b1857b9 -size 590830 +oid sha256:ec9b5930e50e96a57b1c4cdb147dcbc54a484d2a8c8b25fb3587f4bc0378c12c +size 545131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3c15b9800b..b67846ff83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf634eaf82d7c72b7202706c41bbbf1027c049bcd78e899207c77b344b5d47e -size 626655 +oid sha256:2ae138ce71c60752175fb660cb30f1633028a0cc3592a30ea2601f50394f87af +size 598271 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ca022b7624..505d7c9b0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79083440bb77f1423d1b3c33e6ff6e8434a9e95fb803f7a2dfaea1f972eb62c4 -size 574254 +oid sha256:8dded9944d41cabd7857a6b5ef229e05a843c191774997662029dbf09026a2a3 +size 534081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0003212fd1..ba803a17fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56be9d2c764a380d45775cb93ebdc59cf6840267a695ad7910306c316d6e5a6d -size 490794 +oid sha256:0759014bf3c82064f8be83eb50850b8da2e90b267f41cfdc969224620c16c6a4 +size 460315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 74349949b0..f27ff35141 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27fa306c15c04506f2eeb49456ad1920dc3a0ec06cf142f82dab0d3b6d8a5e51 -size 457936 +oid sha256:3eac32eb1df49520c891b1183e152875e9e22e36fbda48e5708aeaadfd4751a3 +size 429011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a71bd7c80..7fba6fa677 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f05b47b2541185087b3d3502a98b6e1fcc50f10ea4f5fbcb173f0a7063392a7b -size 481636 +oid sha256:090271f2b5789f2814042c9bd0544d4cf52860fca9d295535d1c5813766da6c9 +size 449579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5acefe2718..a4903bf64a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b7ad324102fa900c47b2cb8feaca95cd332b84eea0e6ced413d7da86103ff8e -size 454304 +oid sha256:e48501fbee77407818bd30591048da1c6ec16bb69e358b0482ae023c34a3f015 +size 425403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b77af8541e..5fe6b2eb02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a140c2e2014fba58b2789fc18349a9d8ccd361448ff59179ae5839daa414e477 -size 638789 +oid sha256:221938cece8855eb304e324d4178be760c242d5d79688c1dfdf733a32dbcae64 +size 605373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 43642aa183..f212fd3581 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:154807c5077953c2a5585410562d2a4f5ebf0b1b014f4a32ff70e1b65b193a1c -size 586906 +oid sha256:40cac8f1f30e356ff10e785dbb8c221928d0458499032ccf00b4e01d5be23010 +size 543625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 549c4881a6..aed4de6227 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:983c3043a48bae92ac6e564c62041d37fbeb5995bfa5e1cf87030dcee8f63f17 -size 447238 +oid sha256:a3f6fa1bfda76ab86d3efb46f8a1e12862661a59e4b2f66f158552fe8c449a02 +size 428597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bebeccda29..c758b71c56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:712e5ec1c49ccfc014430442ab3944544bd32724f3372956afef4eab55548bec -size 396366 +oid sha256:01c6243daedace544fa3503b27fed667b312946c91c0546a5c11f9bad1a26193 +size 369833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c5a5f3141b..08f49fc492 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6badebbedf2079dc2f50340cb401545a1592de7093c79d3e08c3d27fa1191391 -size 429376 +oid sha256:0e1285d9312d0903da950160e1f4b7da9f77a72d6c1483a71c09566c32f7be22 +size 412315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index db4881629a..e78d322513 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:0480d9b5c03425111629b47a6660b071e2b49b70417a56106d01d077a4aa7a4d -size 378506 +oid sha256:811b96b34e0922e2e87716e2ecead2d5827245e73793092aac0216a7e37f6cdf +size 353551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 30cfb4a22e..5af847efc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7051e49c82e0d10102950a887d639c218b16e20d266cc3d7bb702154a299339 -size 474218 +oid sha256:72673296d19f9a875c19be8bd47a17d2e6d93889fac3ead8ae5754fe4b6100bb +size 449263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b690bc6ee8..f692e63989 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:79a038482959da988e9ccf405591b1e5bc2f02213f3bd3590f44f32ca1eb36ce -size 441360 +oid sha256:1548dfb89b0ebbba33b02c73a6ae2ae79a34768ff7bbd09a78e5d1123cdac990 +size 417959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e901d16033..03d9ab6366 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce5c3c5fc0a318747990414fb172712b83c28d0ed501309a7cd2fbe786b67f9f -size 465060 +oid sha256:b556c9490648423f0408a2f33461a73a53694ff3cb353ac3f7b897126e3ae0e8 +size 438527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8cc3153d4c..cfc24707bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0643f9373aa0a579efbd7b01b967e535534f6c123057e954a7eff2bef1d2933 -size 437728 +oid 
sha256:480b93ea21a7b8fae2a1e0e859812de21f6e31c93eba1ac48fa5e94cce9e91ff +size 414353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 102aa4b54b..b22087e195 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40bde6ad280f45eb133eee398de57aa841d438ae72ada024a77374126d30faa7 -size 621917 +oid sha256:59ce7a5a8f3310ded18a2c1f06d7209f378bbe33d2c896b74ff39be782bc6ef2 +size 594323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 64e3aa0143..c0d615d0a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72f04b54572591e5cf0539628ba89b2ae45fc43f38f764590b60dfed291a7415 -size 571910 +oid sha256:5454594d90ce70e3e307c1e13c64f9172447efb1fcc9121ae30febfdcf309a7e +size 532575 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bb0d58ab92..f26edca81b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:133585e8799982a224109cd76c24057f64c570c5815a163c75a391ac2b2ff6e6 -size 430662 +oid sha256:b08910e66cccb8f148cf5915198ded637fd1d114d5eb6e8fecb9181f58d52a59 +size 418337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 41f6b44892..e6b4977dd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10dd0e900165da3946a290fa7644b3965c679b85e4512c836b3cb6f88e53a53a -size 379790 +oid sha256:05eb3611265e95f2b90110c1b891b5ce634ba7e00436f333a18e21638f92eea3 +size 359573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1826f73e87..3893f21562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97bf0b35d7adcae83e2614b287eec3477407910591747fb9bf7a8f9d2e4100dd -size 412800 +oid sha256:6cbbb2adba9cd8c5f344146b8aa736e386e1349b2f6a885ec9f16e47d2babd70 +size 401265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbe9a19ca2..f1a202e50e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38c475483ad75da7821f3a907a4f2f0cf0bf57630f8908ce035daa75b92d79e9 -size 361140 +oid sha256:cf89b20f4c91ae06d7835c8293763bfbcb80fda4aa1272fc36236a70879f5632 +size 343291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a950f9702b..9a650bb332 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:065d4ceb129aee11bced17a64c72d8bdbc49213705e70f6e2fbbf9fa038533cb -size 505826 +oid sha256:e2b3f95727f9d0b67f23bcfe3fc6e5a70f29165572216da801563d40569fc45b +size 479293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2d5cfcbdd2..6c1ccf9fb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8089c7c268bc3a172f4cb14a6ca3e670fa50ca3b3fcbd329ba86f9f9c9e175a -size 471388 +oid sha256:46db4b56250d0c57cb59ce3c63219b84e9829663941c395cfe7e8e27c3ffbc85 +size 448777 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bc927e6ca9..633e06d577 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1f079dd99ee5f17825c71799f6a7a0b36e89398e16a9b93ee23c9923ad05ca -size 495878 +oid sha256:7a623c03909951ebe246ce9bca4fedb8f9b51fb6b5cd4789f84052bf83c6ec6d +size 469345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a43bde3d1..23225ad3bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed86e9c733c048bd520786a10cf1620bc5c3071d4d84532ff2c904790cd60b02 -size 466968 +oid sha256:8cc2f9a0cc088cf90ad3c8dbd7d2e3d7afe727566f3f51e2c003713e147e7ef3 +size 
443591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b8bd776ea7..20669b2fc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96f9657094acc07b98d2ee41bf79e7e397a2ff1a66254a2d70040d1f12894a92 -size 701575 +oid sha256:6fd8dbe74d29b1e5823d39ffa5cb970796100e83dfbe1b8f57fd53059c700378 +size 647342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index beceafcfe0..3d9d2f4de8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51c90f7a06bde3f283e4fa78ede68f390a2d74a1b742827ed28bfa53ba0af38f -size 644193 +oid sha256:e718f24c5621e72fd733ab92317e5865d233ed9c6abd8a23e9aa1710ac5ead3b +size 583595 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index dd6c7978ac..bc88c068fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5f6eca62c7644bcce981f107b09dd82e95d3e1f8b5a1dda38fd626db0eb411 -size 481878 +oid sha256:4368ec5ecfb8fc89c992eb1a5f7044b35b3cda654fe4da7fbeeb7c220bcb8296 +size 452213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a8d6936fa5..17dc09665c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1aabf34bc8b83d18d0c34f14bbb905247e02e38bb5e423a1e040fb3458764e8b -size 400224 +oid sha256:5e8c2d9ff07f0342eda303d760666a14aebc726455e1584b5b618a970eaeeef4 +size 383163 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c1963d3644..4fbf570ed9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:327df4bcaf3e7bff22742d558bfba0e0fecb7ab3da6c79a82a74665d357385ec -size 457704 +oid sha256:a22afbc2465200fbedf95628d2f1bc6aefad7bcdccca912901540792f1735297 +size 432773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2312e17f5d..bf3aa5c748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57df5394b6e47321c21553a330b45b4003e127196103eecc23921ac165f83d13 -size 380784 +oid sha256:b9b1831e5b9d3fb25a8d11fe005dbaa8f6cb722260c5eec24c7ceb327d0b98d9 +size 365303 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 260fead36e..3f267962ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60f3586c65a1c05a88e0729da3bb6e6cbc48f6bebd2566efefe82b4bbe1e2cc3 -size 489250 +oid sha256:c1925ff51f32cd31e21c81d8ccd1722a5e45178c06240d13b0bcb83e3a94fc5e +size 468241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c34084fc46..a2c0a1330a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0af68c71338072381aa01878db5a816156b518109a058331928d26d0e100548b -size 454812 +oid 
sha256:c902c68ffd17c3c51d21b5b118021418210e0bb0db24d4320e4aa4b1e4e13af5 +size 437727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d095f76ec7..8c8ca5c7c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95e0c23f48a26672e87787326ec4c21b2d88203a7ae621da3db7eb4c0e5c38ff -size 479302 +oid sha256:2faa1170c9ee7d6fcdc71b49ecad3d89466d25cd45b81e878875bd953ba82a73 +size 458295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e8b01023bd..6ef9a0848f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d4b8fc32aadbd4ac6a3e379fb72cd5b6be96977205b301c0dc6468a3fd5274 
-size 450392 +oid sha256:cbee7ef245739a75f79c846a74a6fc55119c664f03f4545a41f6ceebdaa33573 +size 432541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e1df63ebd8..870107f7db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4593f7e2d25319baac5ee97eba18cdbe22c5a3dda77c6fafc0fbc4c16f163988 -size 682039 +oid sha256:c1795aeff12231948d225fe6bfb39a98a459b8517ed767353a6b175e2c598b02 +size 636292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b9bc402575..16b26c1943 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b812ec9d0a8259418346cab34054a359eef4bad1d97abc6ae2f34c854244f92 -size 626827 +oid sha256:6d004a3c852e3024ea7646b04fee6ddb86a46aa2b2076652b780ed59db9110b5 
+size 572543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a33972ce44..4da34aa438 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8afe73f4880ce3db60376c3da8e23b7d1d7b9d43243b8f4dc7242fb6f8b0c56c -size 460566 +oid sha256:ceb815e908297617c3cc4277b60ccc97c03796e0e810c36c3bc6d43755dcf471 +size 439583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5fa2d29a3a..ace0fe997d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:165f4d266232b757878fb4ddfa5e9ca8b6eaf859de430d9db34aa94bcc26753d -size 385226 +oid sha256:d3a0454ef56f20bd64689b03152e6cc62abc7c3dc9ddc93d763bd2727902a3f9 +size 372113 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ad8f72b00d..44bff86169 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be8d1f839c20ac0404f901064a74e1edb22a9a4133f8291799321849b19038f9 -size 436392 +oid sha256:844b18b29e3f4bec4f50cdb9182d1b7a318143fc6eaf8c54df08ed7613713f22 +size 419355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a660f53f43..812faa4d44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70ef01714ae6a43a4ec4ea46810196cdf3c8a3fa0de254a21f86709761da3c74 -size 365788 +oid sha256:14fe3afb9af41be719b48de9737c94dcce14f3e79508d64e436c5f0c062155f8 +size 354251 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 17f86216f5..53978616df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5332e744cc896ed0806d76a401ea1d6375d04543cdda4da6cdc39d3bb3ca756 -size 687015 +oid sha256:c1173104b88f6b54b8b1998fae637cfa007e2a75a4caab87a4134b6ffa495ffb +size 638998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9a6d259381..bc4c8479ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bad078994fbc0138bd20118fe1a58762e6bda82d853a12d8d57cd7494884ff2 -size 596282 +oid sha256:bb18f305f16a28295c83579c36310280a165dba85c8b117fe4e93269af3cb936 +size 553617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0485667eea..1fe17f0953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a797733dbbe12b8ce9e88ea899218cccc835fa9e1aa7a59ced0cb2c24b4e180d -size 648437 +oid sha256:32cd965d5de74d0422d160b581f24e682968d6d532db82cdafcb90e4bd80e600 +size 617685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b1a3585e09..fa49264cde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2954719c224c9f41c4d49846ec419ef65914b32f33d3549487d6c7cbfa9b218e -size 560762 +oid sha256:7455e79134289fbd542b7f7abee3c7ae1273745352e7e3ea53740591007eb08c +size 532305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 35b498ddfe..c19adcb326 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6a1ec3025b9d8e4383aa0a5d6ab4aabb7b4722960a18a3b4570998a9d506236 -size 624981 +oid sha256:9f067d5344d1bdfef2e86308b188c2ffeecc756193ec0091b89d2ae1fdb47c40 +size 570821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f04dabcb4..fbdde866ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6984c53944d4a207d95acc63936d535443561ae6b957be87607410896cc6ca49 -size 597648 +oid sha256:7f3fefa50f011446b3273bf09ada4d63fc15786b649b938c2454acd1e90f2f1f +size 545831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77867b73e1..ab7911b14a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a37fbe08db8771b4c234c2ecd1a74655fe71a821c9438eb648afbc25b68e0ca7 -size 614244 +oid sha256:4ee27b32516fde1d060bdf0aa640277f2d261c16db874a733be071d4860bbacc +size 557717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2ade4aaffe..80fa409c8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3243364468a0d44768d2144f2cb11e50ff408cb8b1a86e4c5a6150841c8042df -size 591648 +oid sha256:09b3ff8afc266cbb50a99bbbc97731500d0dd2d413290fac302a4977d700a0c2 +size 537489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 33b31d994c..8f2255d943 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a9a4821030a08be606feb8b5c581561b7287e4aad21daf5aae286596e24d8b7 -size 691255 +oid sha256:fec44028cfe46baa466c2a0e6b1a36a21e4959bc17ce0047e0d81eaaaf7f975b +size 639588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 85532bdd6f..19822c7def 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1d02f535560d7918d1da51a43a17a98713c9e9712c1af075606684efcebab62 -size 637499 +oid sha256:632f92da0d61586cd4bb0a780ca01a10eb82745a8b7c53ad02c1689f573db536 +size 587679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 14186a7759..21b3b740e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad2341ceaccad4f99a9fa77077c73ea12d51d782ae87f2c7657b297ce3a7e0ae -size 572742 +oid sha256:7405065c7c7dd6386c40328b674202dfb0f5d7a336e795647e17be1586a75178 +size 534369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d5923b0a6a..c0df51a435 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59b0d398d37e92de77f29095a0a32658ba48387a87d791313c1470a25c125eb5 -size 523450 +oid sha256:08a03bc885d1b000358e43bdf79aad6785b1ac9d9f9ee058aa7a2919d7fb000a +size 477183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fa0df02455..5e5ba991b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:338c0f4bbb814dac0a15e39cbbff9039bdf869a231212d57110cc7c0eb051609 -size 552512 +oid sha256:de8703173c3277546fca9eaa2a78bd8a9864ad3a3633cc8deedd31ba88985fa3 +size 515719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3ae66ad282..f115501403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:102eb7cc918b90a7830e2c0e6931312d63618a5f4c60342a30e9dcc81a4f7fdb -size 503220 +oid sha256:f2ebe23bf99ef53c48f8d868d7346323cb76416259b10201e254bc6fe849a699 +size 459323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7bb734daa4..9ed86a5c5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc11fc72935a9f154c0fde530c448d30c3888ce5c490073fb951fdd9bdef7535 -size 587092 +oid 
sha256:06f08ee0709815162c8f36b4d9db4498dbcab01d74df002f2c483eb1df9b8499 +size 549509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d7b293c2cd..b123f2380e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6942c1f3f93e68c4e7cdcdecf6c5d8dae8bba5f2349ae106fa11fc9c82df6df -size 559760 +oid sha256:48f0343461f9424db63327346be6d760f99428a5627a5da5652b22a1e96b2963 +size 524519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4ef89fa14f..537d8a714b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12c9e9c5de52e5957734c58fea720e44a8a8b6727342efcd7bffdf84080f392e -size 575568 +oid sha256:e71fd1ef2676d0c8f3787c805b1d180da49efdbaa1b676d0239a723ee8b804f0 +size 536405 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0f8ca5212d..f343cc7b26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30f31afd6adbdaf917014bbfd67d3f2f0b2c78612b43ec687aa15152034ab2b8 -size 552970 +oid sha256:9e71e703fa71c56074ff88720e1073a12526247b76f570b5d4e6e8aa6eb009cf +size 516177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b609b34d24..54f8443b21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dbd0a4fdda0f1b27752494513515ff6216cde3e6dfe1564848a4ff38f4464d3 -size 649519 +oid sha256:1baa033be11890063bb73b85eadacda7ab091eb0d7cdcdcb0953ed67d1a556af +size 618276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4e0d52623d..7611350e87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a4479ee73e40e0fc7fdc2d9a99caaa5277e4bbc5801d73ee130c6e9d10fa4df -size 602768 +oid sha256:4584edb326b07a05ef0c58028c3b45e88e8bb5e020586c709a4ea2af4a8d8f6a +size 567157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a40a0c1cc5..b31dfe4689 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8467693421a3b6e1f35c49968dc15bb4e73b46646a9af2a00bf5e2a4027c44c3 -size 536432 +oid sha256:6a1914c862dd23dc77f71070c7d323b4dec4467dec31149b0075de776e529d70 +size 513057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bfa451bf3..289e01e185 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11e8aee556153ad65fad19236f50cb87c1b0c9c518a56612b4a7579bc177bd74 -size 486350 +oid sha256:3d7a75c385d355c4713b5e5858f43aeda62863d3f104d71cf78a3d3f73cc81ee +size 455871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e577a95c2f..2d7a14847d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7d5362499c29096ee4e0c7d1fbdfa2c1f836ebdbfeb8092a9ef18096a054035 -size 516204 +oid sha256:53e6c7dc373eae865ea65ad6251cb44970b6a83f62da9aa2170a70f4c20a95ad +size 494407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f8fd5393f5..8ebabe1d4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed9a9f8b7761028007007937e2775e299670804075a4cb8cc795dcf5f49f3ed8 -size 466122 +oid sha256:924a56dfb6f59402cf82fc674408ddaab843b04e31700cf973c18303c3c46413 +size 438799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 186cd21a6b..13b994d799 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2cc2a41306c8a19875e2951b0689ed3f92a5021b0df2e844f4461a11ff084f7 -size 640803 +oid sha256:f1150d780f008ec7e3b602117dca13fe048407fb3035818525c9d2f7e1d19142 +size 590589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f01ffe7a66..32b883011b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8009feb523958aca6a46e40d7d3e2ba7b6a4b773941ec4c1c8b17583948f33bf -size 611890 +oid sha256:e26f4db8db5693119d677100c08d7d1aaa86a7ee38e671bd5f1228f8a6e213d8 +size 564809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0ee13ce194..9d5eb15275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e6b358b90095f24f7679b8c4a0ed3a197c532b36999c4a27a9e490b6d665775 -size 629277 +oid sha256:97f705e6962eca45a27300ba02f79eb81eea9b1ebea4e1d68f26c00e147a1a89 +size 577485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
4f80a72540..431f578671 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d42910094ae7245c01f2a1735bdfa394a63e0ac57915871e27b43345cf460ace -size 605100 +oid sha256:067a89e42058a1e38d309a68b6363af2d1981963849c808ea7ff9cb986662ca5 +size 555677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 62c96bec0c..1fd8ee2c93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f33752bd17738a5a5651368efab153d542301fb864e87dfbbf3ba7df0ef89b0c -size 749009 +oid sha256:c4d9bdc67d9bd2eed43d35ef189b03cbeebde7ef3bba70be4d8599b805b6e8fe +size 677016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 576b63cab6..ce896d877a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9de61de96614d58c1d8c3a9c72ab3e355fc59caefe1474dc2bd60c82f340c163 -size 633315 +oid sha256:1100f6174f982fe1764f1d519fc9fe38aa4bcdd00e667171176ab08e97874ab7 +size 592425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5258c2587c..6c3a9f2a81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:495bd3ebcdf6f6f2270ab5c10825517b065303dde45553f10b65ab333a740126 -size 606594 +oid sha256:85031dc34441a75463fde5db1f25ec625460ff495e3872e48206c10ec7b87d05 +size 557195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e25386cfbc..07a1db6bce 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8396e31640be20e49854d7e061dd9aa451516796c4d1b8b4236186a26f258240 -size 523360 +oid sha256:6baf149c4058560d1b580122d829c8dec65f25552b21825ec7e6bc8c5fee4645 +size 484987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dcdb58a2c1..00e9f4fd6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04ad51d8836ce6cdf64bf267731bc07b340a205f3bcf6ccb7842085042151396 -size 581628 +oid sha256:63fd9cb67162865d01f06aa677f528de0f255f8988e9d9bac5ae64ba8e5f9e6a +size 536177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6bd0838136..79fc42c281 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fbdb691bb14f6b7bd7bc0188171cb14155f0be066cda3687ec7e0947d633157 -size 503920 +oid sha256:854270c05f23fd8996cb24332d52592d60a9922d9c0e48e096d6b305ca1853f3 +size 466337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3e6c056a7f..da5dd7faae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4dcbb1fd7db30c312e7ff37754e87b7f022b52973359092dd6fafe05767b0b5 -size 602124 +oid sha256:d64b8964521d3479c2bf9ea6c73daa0ec70ab58ee62cbe94d0f0a4bd58f9f062 +size 569277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
22c6f0e114..5cf916cb1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b89e910fce8afe4a1650259e5628c2c79535fdfadde7063a3c4cc36ba98e6dae -size 573212 +oid sha256:2485e712013bc555685f145decf7b439be95cc7218a41f0a1ab7ad5d9b41d1e1 +size 543497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7e695e0394..53e0677810 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b416a3295f7fa46b5ef9cf3b2fddbfdb924e191eb6758ecd722b35805d59d2fa -size 590598 +oid sha256:5430dd5303eaa83e2d42d18af80cf465e8931edff54f6a8623ee5d028c6aec9f +size 556961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d407ebe47e..6b4e73f360 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61c360e55f8a5dd89601bec6a1047b3c1d7269cd4980d34790277d5291479897 -size 566424 +oid sha256:dbb0fc5e8ac3843fe0b4cd925beb64040e5a10554b8c0e4ec93b65e48e0eeb48 +size 534365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4f940146a1..7e26867133 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:267e6057b6bd4fe3a94adc24dbe0495388976874a6397c7438365b9482973219 -size 707273 +oid sha256:f12fa046ddbc44cc59819330e4167ff8c1543a64fcc70eee2e07c590e984a42f +size 655704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a529a0bcdb..3994711074 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56b034f9974955e092c407ed23bbf427593b43d2c5a4de352d1b5fe2d1c13b01 -size 597004 +oid sha256:94574d7d32795256b765e3048c8b2bfdfff67e54ff7a968eb216caed9790608d +size 571903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 09828e0fd3..ed912deb56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cbd4199b9a5929907cc48e68ac53cd0c16cd3e30d4a2d8bce8b4bb7dd722755 -size 566338 +oid sha256:f028034495299162e677eca9d3634662a365b356d70d3ccbea8639a815ce92c3 +size 534303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cd998ce097..f8d56f8d5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80b25b6631be05309453f6fea1c94f03afa22ac794966406958d5ef0c54d5947 -size 487840 +oid sha256:d365f379ceec42a6dbb4d465dc50262413b60c00b0a3e62601f6bbf5f20c1171 +size 463675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 88c2340b53..a02bb62ee5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d13c8a5ff0a64d243d633c04bd9feb6c43e8369fd0a748b949dbcf09e6acb798 -size 542162 +oid sha256:21f3444b7938619f6eb9b4f49f9bddc7fa259ab5b1e2929702c5ef56835cf7b3 +size 513285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 754738c0e2..d6752628c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebf364a63f6b8b18bee09a555fb9a80e80768862c21ff11e540ec7649a15b592 -size 468400 +oid sha256:c25c9dbfb3cc1b4aec205d19bc28c7ca5fc1c0305fc2d9b15ef1ddbbe50f6bbb +size 445025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cb973cc15b..14eaadcbad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af793d48315c7f635641e4977269c2220c90210d3e4c06c85f170f9b3a263109 -size 581660 +oid sha256:4dcbb77c074ef7a1c4345c33f481f353e12f243722c1809694e9913a11ffb713 +size 540673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 
fe63d06ba9..9d8920d24c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5575d32f19de2bc79e0019c460dd9cfdacadab04f3f7d1423795e596e106cf4 -size 521096 +oid sha256:657335d1b467345921db997a17815f88f6aff345b7243c881498d41d0d3e25a6 +size 482131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2e5cfdc644..029cd36eb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81af6008028a6b1182d5e9919d2aa7bcfe1a70184e51487994f993c816d010ac -size 560818 +oid sha256:61f596a041ef16d34986e1cc8f095eb30993caed469f81cc5ace92d72f46a966 +size 529623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5a6521a354..fdce05829a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d9900baa4d38bb1a962febc14c5748ed693603e0b9e3d5353d07f8b248f0d97 -size 503730 +oid sha256:66637af56df4410a9eb388153abae05dce6b2672818ec29c0e464b93b868b9c1 +size 471081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d511f3c5d7..e316ae3ec6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12fe0a99bb49acc6effaf982ac94cca4ef8d63e96e6e8bfb0b9ffba31b24d2a2 -size 464942 +oid sha256:848bef441ff7edeb3dcee0fcea6f0428cdb2ac1bc3229212e01e64a6b25a9a28 +size 433697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0add7d454b..6645e04e85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:545a970d35d1a1b72fe6110e82e072ba1ebe1286b9dfcce53665e8a054171dfc -size 449252 +oid sha256:10790c4d0f5759b4178188d3c8f1a87964da400e9bd93ddf229805f90550894a +size 418747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b78c6b9ef2..ce09a7c4f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1278ea9908cce2b4095e8f2312aa97d2b94eaec477457203013e75855a77b3f -size 459730 +oid sha256:a61eeb34960ad5ce4b5bfdf13a26cb2c8ba49506ccf52ff5d5dd91e1bdf71e86 +size 428461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 594638daf9..064d9d93bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f291d926e676e4c8dbfe11171ba85931cf743634693d77393fab74c2cf5b266a -size 444040 +oid sha256:9fc43f0fbc5f404ca40cd8a96ec5f2f7b6017cc8a681d7d958f2eec874b5c148 +size 413561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 96f7c0b105..b914762cf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8188f3cbd205d1122c1d2e1cffef8d6b35c599c39e47a47e7642f9288ed31098 -size 574234 +oid sha256:b710411c2a2e5b1c5cbd2edd2e483267239f258f5ec47e8f604f296572b3de30 +size 535145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 95432f958b..a685e7223f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:58e60b3efd13ad6926596b6a474d6a4b3d379668b191a972e3252599d90ea328 -size 517222 +oid sha256:f121b9bf126b1a1832066abdaa484c74e282d9a0f9e2e2355ed273fec1a99fa6 +size 480625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1871e2dd81..0f08d56942 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:547233ce34185393dd425eb4aa164b6712966b891afc8e16a22d40fc47cdae1f -size 434804 +oid sha256:1a085f128d810ae87aaf05ea72b3498059060c4257afc97f2940b8b12c3fa58e +size 417743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 656283ab04..b8d8430dd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40d0a8a97b1443bcf6a358cbc25d44b03ed9e8921cf6b082d39be366ebcb14ad -size 387090 +oid sha256:8d2d4d46dad2a7c7b76198e466af21c23a2a28f046d99265b9cb5d76d62df711 +size 360557 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0b01b6c825..9a133240e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5a094a9d6486742775134bf334f2f70537afcd2bdb0ef78a58876536230faa7 -size 416746 +oid sha256:72eba0fba9f7cc132bb85c32a0a3d08cab09d42cd938a80deb5f6e848a5717a4 +size 400473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 37f1683140..38530083e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b1ec662cae5795eb7c37bb8ac9c33c08babf6988794017d34dba4fa8816e5b3 -size 369822 +oid sha256:521b17784b43f441db0958633a9fceea964fbd44853714cc80ef84cb2a7bb806 +size 344077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bb4d1e2d38..ee9df8d21e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a75cbce99c166aa84409a834e319ba1697bbceedf8291188edd085f3fcce8cc -size 448366 +oid sha256:ba4a12bf27d404d1b3902ee492300f967566523c43102588e6759ec5ebd4e14d +size 422647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5207fe35b..8f3ab38851 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfbb8f14340e5f86493bb6846c68f8a8065efd368cd819307ec7640bd16976c6 -size 432676 +oid sha256:f1b01a8c387603bc75ff410bc863ecc3adf0daf5e438dcc13f6ec40543e323c2 +size 407697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 66f783f48b..ec48eab622 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34bd2498c215f9da55e02dcf22eb12ccc6e0617f68a8bad279936743b661a80e -size 443154 +oid sha256:4a850cb67b22d63a3d899c8c4958968a8ca7015a157766e728e8dd2a7b344d8d +size 417411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54eeb7a733..63fc4ba601 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0d3474acc6c1d17294bd5ece4a054e13646fe0507b98a1fb2d3f780fa4abb88 -size 427464 +oid sha256:4ef3cc1f247bb8fa0b1ed7ce73b2b0b0f5166e11563c1d34715fa404ee4cfe8b +size 402511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index dacef2b2ab..f5912fc467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:383094d00fa1821275c1fea4eb7ab3bab6a6c5a1e710c98c333009a61d7de391 -size 556868 +oid sha256:ae50fe49b4dc1b0c287a08f1896454d102a0d9fa7abf3c6021a541fb1f7323e8 +size 524095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index d091dc4098..8e26c3e623 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05b308084b5a9bfdea183366288e294757374f319390ea7edd20648b0a35ac5c -size 502224 +oid sha256:1d61bc9c2e50fa2204957945879b168f288e1489b5b25cef5dd4d0ebe7cf5dcc +size 469573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a5b14cd45a..3fb497995f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5edabd6b2fa6c65fda61dd2b5e3c50070fe81837f0338d886a371d1cadd737e0 -size 419016 +oid sha256:fdfe86524036008fe263c70c34568d6cb4e064851c93d009280acfbf7cac756d +size 405903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ae061735a1..86be491b30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ab08b11094ff142cd6d857fb808bd1c58c59b4777b94a2fe8767b76b91ad006 -size 369724 +oid sha256:a5905e051fad13dbd0bb440c4d466e074a6efdc75624f66a02fd7d86296bd89a +size 349507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ba9caf8a29..2d1401b6e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b0b998d67069ee8880ed108c11e3e813ec510a3e837c644e495cd22bd975bfc -size 400170 +oid sha256:4865f15e1c7b1caa02e175db9e013561c4fbf8cc9032ce676e6cfe393171405a +size 389423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fcb4be1a1d..627bc2920a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:524cf0df7e9234d7d26d1a125d0b2c4c3bd6bbd33eb0f23bb43d41190c6bafe0 -size 352456 +oid sha256:eb7923cdc93324344f666f2164f41fa18384ee9a5c602dfea0bd8b0c1169f65d +size 333027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 685f536911..6d176c3b74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9af89f03ab38960e203628e8bdba567ff950e09d2e0f078524684175435d2f12 -size 479184 +oid sha256:8d0f8d92a8e7362c390f66074fd9ccf5dc981c53aea13443440ba63b07949946 +size 454253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3f3c13ac61..bdbd440bc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43365d8a2e13005ca234071f77c66f1627e48d430cea54d1d48fdd00bdb16880 -size 461126 +oid sha256:6a06e1c8ca41787efd6a44fe0899a3f57e140b9b935f8c0755e6105b5cddfa86 +size 438515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8879c0b115..313e91d785 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3cd74319646a5964772dcb4e7fae25e5ae1275b227cb2e4269ae76cde5b5720 -size 473184 +oid sha256:a8a0f70d6e80323e92a474114f6518f879c0569095c71a2e75a63ccaffad58b5 +size 447439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bccf2a4126..7c372f6a9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3427c255a7976d421abb8a4763cd6560705a19783917ac403630b45044a2648 -size 456704 +oid sha256:57b9bef075c989fa233ec769d80d5631f9654a350bc2bca15eb5346293e6fee7 +size 432539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 06cb54fa6c..09f9446aa8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7d35aa53e2232dd9032832f64da8f6cfdd9cbf1161d269d441218179b6f6879 -size 635467 +oid sha256:012b24d32572672987c9f7ab6a0397d067a0f2b96e28303e5bed802cc62f64ca +size 578691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 31bf23aee8..cc1916ba38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:172b23bc5772ecfbab29e52452ce1889bb0d51e5505df991e3e73dd2844e4660 -size 573322 +oid sha256:766a0152e3e3dc1978edb095791e9a5696c76faaf73c7b7d5aca889fb9b84be1 +size 520841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ac610ae55a..a8c272ac1a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4b5ce991829df0b1307a7d1c0c9bbfc1f6d2af8240a2bbe7f06c1a0aab4620a -size 469444 +oid sha256:77886fdb6f75e97bfa8c9f44fd4146738d5ff9067f9d581838e3fb8e0f94ac4f +size 442147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f5b3ffec70..9ae804b56c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c214e3e1fac44961dc0118aa1a4aaede7c70f7553e4f3561057043b550fe41f0 -size 387000 +oid sha256:35d933bd21ab72cb5e1e0bfa02ba496dc504ac3bf57ebb783704625b6f4af019 +size 371519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 35d9889138..a1e0e26afe 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fc0ce326ed7572fbae58ba4e578c5eb56e892a3f7a3dc8d337cd6066e9e9502 -size 445072 +oid sha256:676dbe9e4e57b41929aba70ec75f557c33aa34f747079e81cef8ab9d50d1b88c +size 421721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 439ba9b310..f3f80b99d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d4436521aaf2894cb0f421d91ed3f1dd82e18bd8fe6bfe7d43de69872e72ef3 -size 368154 +oid sha256:f7db92e0834362c9e8578e54f3f1f186c8d1dadf43217b5fe4569d755923afc0 +size 354249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 896adec55d..41e7afd475 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ada999a68a8cb5087440e36e4dd0372d8aeb9282630c20b5487d035ea1421e90 -size 462608 +oid sha256:6b20b3d91806d27bf8d58747760645f22981b6dde00ea9f3850d4ba111f72b18 +size 443203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fd58dc714e..f9e79e5592 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46f890eff48e6a5fbd72de3e81c6f835656195e0572cb8e16e7f6c1f7eaff173 -size 444550 +oid sha256:e9a32c46b52879838f2361cebeb71779bebb10d69e19d128c7d56f004f5870b2 +size 427463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
fb532c9754..98d0045e24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60da2c533012bbeb70f3c73457d5e3efcea603a4b735ee2da16bb44d9a1410e0 -size 456608 +oid sha256:e82e6fd46269a1fd958a4fb3e163b5899e1d8f632e6285f360c0c406ac71add5 +size 436389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 78c796f0b0..40b4533cda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1092e4545dd09b29a57d16d98ae4c2075c6d7810dbb9127ff7341abf5240449f -size 440128 +oid sha256:3fb3729113b9f962acd0924d59169a3c4aad6764b95a63975feee69db2040362 +size 421489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
ed40463430..d533e45a37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b72bfa8469af7276ee6611ac085b1ce7d94b31c0684f8e97aed8833803924828 -size 615412 +oid sha256:27733366c333b0214a942011f65b6a6282be7e85b9a9d5b0af1caff9cb4e5d1e +size 567641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 77f14a561a..e7a09ca3f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46d751177362348e4e86cfd4783ecb66dc1f7371e790a4654f2262b77b46e3c5 -size 556746 +oid sha256:3de176004882f22541f756d29f593ece51e1a0766499c9953a4fe4347bc52ee1 +size 509789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 616b09493d..b7bfcb2891 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfcce29cce8009fa9b3d96126dcb9bb1c428042cff52854f433ef37d125f2152 -size 448132 +oid sha256:756da442f857dce6cfbbfe47a3235f9dabe55bec2ee4e83b60078a210c513cb6 +size 429517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e8213ca8ea..4cdee160b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ef6e98e4a71a5435704d476810b1ddbbcae73c95a0f75d2dd351cb468ff3d5b -size 371214 +oid sha256:f6a09a4c5d152d159ff88987ea09b745e2f6fd6679e19d7bf60cd75891004ff2 +size 360467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b83629c49a..805fe10024 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:543a3dfce402841795fbe4394f1b774a62d5a23875664a0b21a51db04e6c231e -size 423760 +oid sha256:9687a75658a1e1ea88e9eb7aa911ab1c15b7365e037d8283d3fd9eb2602a9904 +size 408303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2b1d9b4834..dcf39a7988 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36c0d7b4903eeb739b54e5ba81a4fa0bcd076b9989a6652f158b6fe75bc231bc -size 352366 +oid sha256:c9fef43f5cbc60f8f58c55bb2aee37e530d2735648c6f37d292c94a51c851e3d +size 343199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1f1f79dba0..e5fc802df2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:031daa2bd1515701fec5974a73626e32c2f5027a1ce6a451232e6cc0950a29ec -size 696665 +oid sha256:d9567f4cb23163026912ee848b2ebfea5bad360db157e71fb96a917c508680cd +size 698622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7b08b429b8..f18c09f5c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14df302b56d4cd425ae2036671e47f458b56838192a5bb1c492537754754c9d2 -size 611360 +oid sha256:c26b38ffbc98dbdbb626eb03f9b131a34f38c113689d953d2852024fa1d058d7 +size 611195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8c0aef6b6b..1fc265d6e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd48b63096ea2c7bb2e735e7bda43690f44a827e9db75a7e9ee019cf3f7d942a -size 692865 +oid sha256:7ba13eed85c0fa6b8ed7c4e980e7b4a8f7d779b9a93c7b949d04cbd0d121eed8 +size 695956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 84c4bab8ee..618b5fe5cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:811ee09d7b631a0c23b7bc60af9ead0fdf725282bf1636c09a8a45deb2151e31 -size 610124 +oid sha256:fe21cee7be20b7fcbcbf0a5d0976a1a9a98495828838b06a43295327b1840ded +size 612031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fd9cf1cfe9..6ad4927c7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a36ac1eca714524566a12c269c087d77d569d5fe5301c2037453121564015cb -size 763889 +oid sha256:3661131ede5441a740210445384f71c964375a3220603e1ddcffd643cf4646c3 +size 764910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f53d413540..16cdaee58f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:222aef7c69026bea35821c2c1dc29359faf0cd7be0a9eefca1f8a4e34eaa7e65 -size 679621 +oid sha256:a133ba6b87d3e233c34001ba429f9ee6efd050105fb50618ce421c2c3640013a +size 678766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5c233bd708..1fcbb673c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:263da40cf8662e1c829562d9bc1f52820029dbc23ba95fd0c5bef54573a69b3a -size 805201 +oid sha256:d9fcea3e930a80c886235d7d923f9a4615f568e758fa2b764e8da4a243d12e3f +size 799660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 50bae647a2..6a244416c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d2f405a52e558112ee087c5aad5d00b8057f533c25315ceac797d93880cee7 -size 713779 +oid sha256:36bf016a786a33ce38839348f121d76ddf7794375c46deb21e672b025c8fe244 +size 705276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 233ebe8c2a..7a63a7d925 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:60caf63aca97b81c56deccd583482353fa4aebff720cc0e97198acd87ca8c163 -size 783691 +oid sha256:798a537ab131ee6a546e7a6aa67ad432387aabd95ba65e7bde2d2d67a02abd6b +size 783280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c14ac2a269..fcb9260b81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e42ac68aac8368945c27963161743ca76d26c198991d887e1396b2b8f39ae62 -size 690887 +oid sha256:4b9497471beb1d0c2429423cb7c8c8a5343c8b4845334fc2317d50bf2c63fa68 +size 688306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 706d1ffc04..9a5d5a1bd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d3ba8359a2e6ca7a7b331fe5529f81bfa368be425c5d81ab21bacd0bf2bf603 -size 797405 +oid 
sha256:88ced37a1a052f6867a1fa3703728e64f81d5df202a938ae82416b1eb8951d19 +size 795366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index dcdaef7f20..286e0d436d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:275da2678929139b580dd638cf537034999d4c7eaf7190181b128eb4df34abb4 -size 712889 +oid sha256:8175091660d699a109f6b32b15931704aa6f1e42bbf2eb6d4970daa0c584b851 +size 706754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 135f895199..e1605136c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41ae2d10098b9846d7aff50140fa11fe4a87928554279c3ac143a568052be5da -size 777967 +oid sha256:3fb9fcaef238237f0d76c13dfd76f74b0ca7d5ac6be3dd5355f0e263ba01bbff +size 777902 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1d306f5298..8f339e7d72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e63febd9ef332f30fdcd3339e9629e41ac1eb9a572dcc6340a278cdcc5d9d082 -size 691083 +oid sha256:c20a849b0b56a7031dd1a8871dd131d560cc3b110c3aae512f0e453e584d3859 +size 689982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 862f08678e..ccc057cace 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0caf4347212780c0389e54e456721e307eb98cfdc47b8ea433e4d48b2acfd22d -size 875533 +oid sha256:3e67cd3d99ce265a22a4362242a6196b34f4a3c939086cd0adc4b1f51bd12b5c +size 869004 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c8a884f591..d02b57628c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77121865c2a6c48ce32e07aecdc8b2cf2d36767527ff236bba4fedcf4a344bff -size 782483 +oid sha256:cc09f39c501f7bb90a997349d8a58a31b912e536dc91fe6d39ca47e9c28b4099 +size 772748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6c83a9d3b5..ca18438b62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a7e7466e31fb25516efe24aaeb8cbe8d8214cc6cb5faa342cf55692f9ceefac -size 854023 +oid sha256:706ffa57c6d3a2a7a1b4ce981cd8dd07207489d025206b8a16dddc0ec977a1f4 +size 852724 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 512c93324a..2de5ccf820 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:210c18c8bd61934e64b0360dffd50be4836706b990298686e9173734e91027a0 -size 759543 +oid sha256:a32db12cb7b5bf6a1e93d15028b8d1aa0a02df08e4546f142af58d27278ae93d +size 755974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 761ffb918d..0e4cc9ea55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:849770296445c7561b97a7b1c88fa1906548d016b71641e94f200906b2921407 -size 658433 +oid sha256:fd890c43506e8522e623f51f368f5a91a14eb02a38ea79b86f13c147ec994df2 +size 635082 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 819b8c7d3b..1855d2c8e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf870ff3c53b7f243d175378468ec713626ecf2b10b74da7a86c8a0e4cd0a731 -size 555810 +oid sha256:c345ce0201af3a84a8b3a436839b0297348d6738610423cdfd9bad78cc67cee6 +size 537885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7a1327a357..6232ca7877 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ee16264d45e8dbfe41b3ff219b4b9e86a50ca4a17f41d2910c87a6e02d4791b -size 655767 +oid sha256:c0da488b390357bfbab867de8858a3e89e6764932c916bacf0fabddcb10af2ee +size 633500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 3dfb7690e2..5af7c83655 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4f8f6806473a97134bf99a523e11244e8784f1ad6bee8bab33462575e40493f -size 574506 +oid sha256:9d35150321536de65793c0a383504e555f5aa1c29e1d897d6cbba5f748996cae +size 553473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2329f33b23..5efe29e6e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd71af5563ab1fe0c2f7359c0323cec9c6e604ba57f6e3759671af008545b471 -size 724127 +oid sha256:df7f61c51c59098127994e474de99bf4a29223a4a544c77534a6678720c950f7 +size 702206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f37a3ec390..cbad45bd3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37d0ac343968ca5475e30f491c07dcf608f33f2dde8a18239dac1c7c3bc9e632 -size 621703 +oid sha256:38609d8c56cec7102aa266b58400e8bd43543fd0de63007163c1e130651a16f7 +size 606195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2af2fbbc2b..4a2ba3947c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63606a3ed9b7b25350517eb02d684f0b37790fd55dba03b17fb05ff2207f43ef -size 801551 +oid sha256:7039169acf9cf9195b15b4aaaa5f3079df41b6e2bf4b9a065a530cd9c9113c82 +size 752200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 542e853f57..7642e85352 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4159c09b75b723f4798905a0a7927b122d49bb544797df2a131651780981c6c8 -size 680331 +oid sha256:f102278652a69a95ce05febefee8205c3159480ffb48779d0bfc51ed4961a870 +size 646472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7bf51f6388..f25a57a15f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21ddf03db65e23b7de2104b7049fc5cac970ad6fba44a53285489116faede2bb -size 748073 +oid sha256:870ac46004f52fb81bf23e93c13248be6d34ccbf71008112667fd58716ed09b2 +size 724376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index edf7720a32..736ed0f4bb 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbc0e982735c1b8c78d0e0d7c74b2ce74d86d5b79e967dcddd72f9f3bc2536dc -size 642787 +oid sha256:ddd54f6149a73e52d8c540eecc4126888fc2b85545e43605f88717514e3bdf13 +size 619436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6156873d1e..680a74ec21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:341de8004cf8a559929fdc1837a66b3fdf45eaa4ad63b2fc373b6ab4c3f4f976 -size 804311 +oid sha256:c446e693a3b82806a48d9248606afaf588ad646e594474b593b455e285218300 +size 746328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f159a4a388..ef3f96931a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:062ff62c80c956621cafe06523b778392efa33fbcf470438ee690f6b5bfcc5cc -size 709829 +oid sha256:721fb883188e1ee050d53afe8e47e84cba3d48ebe38f1a560da9322062dd3d47 +size 664328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c530d13cab..4d89a4646b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:179bc4f2ce0dde59e7ce91cd9866dc83a8fca4570f1e383130e57e51b2c784e9 -size 749205 +oid sha256:ba540d83b389e32e5bcc915fb066f5faff0de7a65fde37c87a727ad9459b0efc +size 718504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index c3a8660cef..6ebee88bbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:a1adf1cc86bea530d7cf5620881c32ef7029cf657ad125b389383d3902911be9 -size 665429 +oid sha256:d63427bfc0d6aac1cd65da02f739a729c8ec09ac54ed90414aef75b57b52d63a +size 636554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c1c0f726db..39a188747f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5cc0867b614d5a4ad0f1c425b5700ec885fd14006efdbe85f319e88ce7f7246 -size 873659 +oid sha256:097460a7e90316804b8a1732705819d04f5cedd6b53e7b90b6036de021614d10 +size 817204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4768927159..3543a48083 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:9723755e14f30aa7e807cd059e5e989efc6af8118c97f1a4f1b2b0292b5aab79 -size 748295 +oid sha256:aceaed3fa12b425a9e009c5dd61706f2863950e7215d22ddd3a7c53b0a6f73a1 +size 714632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d43402b19c..3764fb2e1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eab5a89593e25739e3777f2dc95dabff7829da9ee62555bb03185c2664a8a58 -size 820427 +oid sha256:2d5d0a1f6b2841001b3e76d4cf9d2c74e4c270b19dc7bb77df7c81625657d384 +size 789380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e29ff32414..5c675f3328 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:bc822b1911ac07b1cf4496647283472410e7bfa74e89196c643bf00bf7fe5f9b -size 710751 +oid sha256:2465d774a11681a36aff0f1f6c0cb33d7d71a17f27ab111b79770812ee6e8afd +size 687648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2b6fa6d4a7..ea24b3b2cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb1a392dd02812af41e9e498b2179c51514d0fbe90cc8d584f1b3f24fefb4d47 -size 666571 +oid sha256:63391f0aeaf85c54790a8979e6cc400b461ef6fafaf303486ea3f5327260915a +size 633254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a6bffb7239..c48e91d299 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d46224d108f723d8780b7003ec0447c20a59c5cea117947555c13522ee9efec1 -size 576232 +oid sha256:16515279d9da70c332acc798bae6f9617e8e072c172687d59bd76eae303c6170 +size 549527 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 59d61af494..681dfcc010 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7df605326f91d9ee1788dab7da0d2d3b48634d1034387158e7a465e76f842e72 -size 661339 +oid sha256:deb4a4fc20cd964e2a109115d64bf9cf44ec8393f9cbf05b6391f1be54a2b311 +size 630686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index bdb367c7e2..c101d497fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7ce50c0c4f2edb3c0f063723124d6436c9dc51a2d04610b6279af86ddb0e238 -size 574356 +oid sha256:893baba61bc5cf01bf69323b0ae536712cea23269494ef60772cc189e5c65e34 +size 549229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5893cc9d08..39e0e5abfd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a9f35980b8e8235602c7c228d875607e5df447a0fe67a4921aa7acd24a82e53 -size 732955 +oid sha256:764ef92c9e896b8651b505cd49608370da7b19e0933bf59286354cccb5ee672b +size 699540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9373765413..a870dd9fe4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f33f889626ba443993d71524af86ebb4754df46b7f59593f294e5b2fe0d1e2c4 -size 644099 +oid sha256:aa9a797757065552bed13702f6e5b33b4338c0ce9c67e757da0d12eee30b6340 +size 617639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5ae0e43739..8a908c3a80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ba21218786718760d2cb256e8fa102cb849e8b0082b5c3f73b537cd2128ab42 -size 758185 +oid sha256:2fb179f7c114dbda5cbe836804797a10ae60e5ed0216c40e37cf3e1f99728531 +size 723930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c2aca0b604..855abc6f7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df41ac09a07267980e9b8b70e89991e8782843de9223856fdfaf1f8d03431406 -size 663357 +oid sha256:4b89efb029a0878d80d0ecaac3fcc7f17ad3ec023a35ecc39ebf3cd8fe61449d +size 629548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b69d549e56..79cf1099cc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2eaf47abd568dad8952c1bc8ea4790446eef76a09b8dee41e14139fa69934e79 -size 748465 +oid sha256:ad097b242e8133998d7c9c66109c44a519c761ef76e6de8d18591826a287a0ac +size 716778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index df5f7e5ee7..29ff01febe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b29f36529e24b2421b5e059728da7fe8a351a5e63907d23fe26fa815d784c13 -size 654527 +oid sha256:0f3acc98f6a78618d1caba78e722600cc6a0843a83fa2c387ba8f766a7ca5a69 +size 622394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 03e21a686a..af7b5446ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dee64b452924167c5618deebb0288cb0abc6b9ce9b1b3de306b5c3e8f0902ad -size 752163 +oid sha256:01fbce48e451b6843bdad038aba336beae5461c62259e3425c3a6737808deec2 +size 713372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index cb06970ba5..8d13556991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05f6d16f6013c48ee2524dd58f04f8303db457d4f5f6ce7dda627d6c6898ddfc -size 661629 +oid sha256:2cb6cf2990630686421c0fc477706146546718cbcbe70724ff68ac338067000a +size 630336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f22cfca820..195a02c57f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:ceb43f8aba6d87ff849a5313fb819b56394c3cd1d1fcffc6d48542a0c263b96d -size 742347 +oid sha256:a8a690da195a82f28b5f39017745473f8bc4676512ff73206fbc94e515485ff9 +size 706316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9aaccd70bd..d79c115661 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd57cb8a2d1e767bdfbcf6fba9a481cd0d6a094faf1c0336b20d3cb9b2e9bb60 -size 652799 +oid sha256:4b4e42aa360365606e87f51190aea442da1e12776e8ebefe7ddec7623f966c9d +size 623182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8a435c80e0..b38764cb98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2c25ed158b6e60369a4ab9eb1e042708a5c6c45e615d888268ced19ede544d9 -size 828813 +oid 
sha256:a6cd12828f0ee12cc8b455e844a3db829d472f1f3fb2f1fcd0422ec279340e8d +size 792882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4257e7b33d..3426e74755 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf9cf5db499df335b0923570d3852532262615d2125f24d8d7910a3d8dc44e71 -size 731371 +oid sha256:543ff061dd953b6930d8b5f202ab0cda3eaf8f5ea5111b789bb86240c56310fd +size 699238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4c11319dc9..bd3f19f643 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10a7dd5e5732b2822de4d6626f2108c8513291682593474c990ac7845c2aded7 -size 819093 +oid 
sha256:2f40a18420d36ac16c8730f6e1fa379f43f1161ce20f23d3557e43993b29fe68 +size 785728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 04823ab417..b85b4bfb9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96df450c57c935cdd3b9b5407e2c48b328c387ef1ba7b66ee9f858ba381db5ab -size 721751 +oid sha256:ccbc5651cc8d0568a56293537070b5b0c7c953ddcdeefd235867acd2c337880c +size 692134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 80d6cb6b9b..df59ccbbf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fabe7ade0d6c6accdbffbdaf368f184606f6ecbfaba2fde3a6e690d8e67e4dc8 -size 731903 +oid sha256:990f4c9979eaae85d1c0aa9ebf92eb3b54a35af95354b8c6cb2cc46578bac4a0 +size 710230 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index df5a017b77..3504b48e84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83a2c24f8cdc4157acc8674b9a917fa97e37937a1be3738a03103c48e9305780 -size 640185 +oid sha256:a7edcd0c8e0130b5156fc0d7e017b25d7259dbc8dd0091200b1101e4ca9ac58f +size 618412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 075cc7ac73..18769abc4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58acf192ec228da3a66c7195e8205b1078aabe4c5195c1a6a2a9698e0fa623fd -size 728103 +oid sha256:762cb7ba1e7b7c2c55aec02c00976bcb291a6091408d3e1d67eedbb3d48fd42e +size 707762 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 8497c691df..9f5654164a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d08d1d5adea95a9a4b236c5a322ffd869e6bba6388560dd4a5503e3ce010460b -size 648669 +oid sha256:b540509e705d13f043b779d9d55539ba440fda64072759f1655e7cd3a6eeb723 +size 620630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6da69f366c..b32a925ad7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ae997195a54cb2ae499050130273d896c7a06d204857bf1ec5304d5601bd870 -size 852477 +oid sha256:a9798b0148be6eb5ff79823e0d46e6dc0b167e205c648a8afbef594f32d2c226 +size 816002 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 43a9983718..e6bf4d87c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18f412b8f66b6f1bbcacf15b47ad82d913388308538cca1eca089d29a2eb5fc4 -size 765593 +oid sha256:82630d311551b7344073c96d50ac3f5bae6b6540b32b5d55d6dc2b80951db1b6 +size 723790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ad834a76ae..1afc79f924 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2005d9ae482d86f71a44b208063f277e15ed0bd2c854281a01900faec6efd749 -size 822581 +oid sha256:2f83010a469d3237cfebdd322f013faddb8f02dfe480bce4787749630d1aa693 +size 796960 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 867dea4d49..598fc90551 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6113f3b7e0e1397355a8ca9812909f0ea68b4f9d36fcc0956ecc35af062a6d1 -size 725583 +oid sha256:5f6423447714d94605e08f7cc8fa1ed8dd514c9fbf91be4870b5456c3b9da35e +size 703120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d6e578bc6..5b62324ccd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d096cba7b9b0f10ce12b168cd4fe7b0d8c4988f784d2bdaa27bc687d6b31b3bc -size 847343 +oid sha256:ce51484f519f80763ca7588eda757a86b462952bfc92def59617c353179cf6a4 +size 814028 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b7e39c63c6..1da91c3c91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2fe81a8fcf7957bf7d4f89fcda40e436ba885225bfb5e19fa82be60fbc82aca -size 771807 +oid sha256:0b790c7a888175b3ca9d87d13f57f8e3bcd2eb23b5ca28c71e3bad13efec2c0a +size 742288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6ac89fe47e..e63dda77b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5b1f2fb539e0bf8a54041ac64e0a3a8e2e1023d58bee758810f3b19bc14655b -size 815179 +oid sha256:f8d637796b60d9726dfc90c5f8e2f101e61d35c8a32384a41a4e3703bacd0329 +size 792666 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 96969444a5..cb8491ae0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6fbb7833d3d53e1d590738f3c694622fc8e8ea9148beb56737de0c5a509200a -size 736779 +oid sha256:2c277ee7870bd1e26c39d09f9cf669648d853c7c5bfe9b2a02a7550f2ea085c7 +size 715156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eaa235e070..7978646f4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99b5137f9c0da7cddffc021c23b4990aa89d76500e934e7672c87ea460a0c009 -size 732697 +oid sha256:79ae2339c56d0e606d6fcf5d8ce92bac9094fdf2a446079652e693f4a6f218ac +size 710234 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7c8b43906e..dfee385e4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ddcaca9d2ecae30d3fbe7caabe95002f9b8d3e31e006f14c40ee7f25b9627fe -size 640979 +oid sha256:f39e6f1411b2de3288fa6900fdfc9c1d50e24122f9cb41c0bcba59108cd177e3 +size 619206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e716ddb4a..fec20404df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d34111162dc390c60b1796f3c8364cd76cad8a5a3a653d73c87c15fe64c8fd7 -size 728897 +oid sha256:cc27457059ac42f56f2fc0e2a44031826c5d322633e23692d0360485177d8ee9 +size 708554 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 33ffead0f5..3e53fc2383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d67be78d25d25ccf4157d274a4248d0ffd82b6ffb3bcd913db0af6f44c5f07b2 -size 648673 +oid sha256:6439472664f20771f935da07e0ab090b2dbd069d883f9f34d05b5436890addc2 +size 620634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a29945c934..18fa8bf8e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36a1be13d930dc092462fb675fe75be97a59b0352beca12700adb78c3deebac6 -size 662125 +oid sha256:9927527b3feb71496336cee21f0ca2198cfbba6fd20f24368f13fea37c53c4ba +size 642030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b8e06159c8..68794aeb17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3a0637ca5fb5442df68f2ec93d846231e4e9ff831dad53a17e0f896d584e78a -size 577064 +oid sha256:b49221d0258c5c56f4149e70113144224652cd9a412013268449ad80806def71 +size 559437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b251d7b9f5..e24346b0f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bebb047315b40fd6ec4ae519bd72e8a1a45b3041c3939605fcfa04db0430306 -size 659113 +oid sha256:34137ca325cf157bfd3ae49f40ae1f9bad9a3c9b80ffc78a69de962777213df5 +size 640498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 72e8033b65..ca65eebf88 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50d9afed9265a4cdf1f197554e8c4dc0c006f8d03efc1444e88eab99c17e74ed -size 576866 +oid sha256:343362941fe26dc867f9d507342bc2e91cfb6369ada1bc1ec684785dd1c1fb4a +size 559779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2cf015033f..80069412c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c03589eea805c3ebe13eb46f20833e82a0aa31ba7985aeb73e597966e5023da -size 729349 +oid sha256:3edca6fe273345244b1b2841426d775c965415fbb3bdcc508aae7d6bc6081d10 +size 708662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1e1303dc89..3e17c458e8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f46a0089389db8ee83229008c57efc048b08443efdc4a7868460483763562798 -size 644487 +oid sha256:cfdf0695ecd30fc9bc093c5f431db4eb6e87c73227273993ab0176906a236105 +size 625626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c51d812d41..5dfe00bd3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7cd230c02fe4d9e56790a9e5d3b5e5cccff60849c511bfcfc6fa26c33adda26 -size 750137 +oid sha256:91e460ba9464aab7a187a2c22abdb4c16eab9d7f84938e8d5d22b1ff292ed09f +size 727724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5064578e13..8dbfc0b25d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b585b2dac691b662a21bfe6edc6e9b42ecbc8f982d195a0cbd8e951f6df8919 -size 665621 +oid sha256:1eea88588a1b1c11fb7ef6254cd171c2e174072fe8632d2b4c354c52fedbc2a5 +size 640248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4c5eb37244..3a316ca425 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc6e9e7f35e72ad5693e411408203255520f662755a6508bebbcb79b16ed7bf0 -size 740419 +oid sha256:796c486036c3b2c312685c27e6739ce9170f520fac70ba2b55bdcebc74d4ff59 +size 720570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0336fcc285..5cda6e0b67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ac103ed66ffc220e3a7a59190d1f35185bc10ae4c616adf5a94b814dcd0de1d9 -size 656001 +oid sha256:0c7fa6c11b3d0f4f7c893721859d5daf1b0c7da1e0c4707f3acb21cf7e533e55 +size 633094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bd23aab51b..62d3449500 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d8cf4ec5a82f63d1fdc0dc936ab4c86ed8193eaff40b8dc7baae1298ee11a42 -size 746139 +oid sha256:d50f9ec0ab01ca8ba7c2e443b2ac34c25a6d8db71b1c3105ba0a4ed52e83491b +size 723874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7197b0ca7e..cb2e3e540d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5798b07f94f584266c2f9937b88be65c042e83a39b20fe665e9e062c559f8ea8 -size 664977 +oid sha256:2c8206a3416a1b5f15395077488bf9326adb3dc52b4a092aabaa7f2b3cfaac14 +size 640886 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 30aae4de23..d87043478e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb940c29daa1b8c76fde141dcb9be33d8c7beae9d0114e7341de54804aecd281 -size 736173 +oid sha256:13e8d46b4b45e80a107a5dd2ad3120e27c3b2177832d8d31b2518a1d233b1f72 +size 716670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index d6ab9941db..37db00ff4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21c6c6d6d7bd57c90056bbb79894a6f5aad66ba65bd26e6d380885e6dddb4f98 -size 655357 +oid sha256:02dfc30ebe6a5bbf16784cb7062e5e7049980c03d033b0db38f38080a2684d48 +size 633734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 67275cbf11..8523f49d8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c63e9b2ac01beb7f03d514e0302927197c3e09b712c7473d5703222b67c12306 -size 836403 +oid sha256:8668e859f3ba9c99590a617c74c24dbea32eb2d6c36a6b8c05b37ca93fdd047f +size 795934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a7786cba05..e21cd98526 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9487da4e646d2e1748c452aca4572f3a40c84b4d17fc93d8126444837866360f -size 746363 +oid sha256:ebda676ad0071b148dda56e02284a7084b47599dce743959aae7229d96ca0a89 +size 706336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e5880048f5..b8c8517769 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f7f3cef0466602d31d7d08ac4b0f684862ab569b7caf66c0f11e16ea233d2f5 -size 826635 +oid sha256:1cf453b1829abde9f8ad5ad469b6761a8d45fb1d0f5bbfc37825905d9f9dd856 +size 788780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 749798e07a..1a0599b09f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82fbc7c4734782914331421301b5658ca3d1f88f999d38f76963b9d34401fe99 -size 737531 +oid sha256:0e33d51c1ccfff4280b0e65f716e8548bb21de2f640fd246cd57a6e9ecee9eb5 +size 699184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 291121f851..4b982c6c6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4e2e68320c1bd21a59dfa3191af359f6e9e65f6503c8e5e26fe55c846cc6604 -size 630057 +oid sha256:e7375cca165c75d3d9262d79401f239c7ab358685fdc14c6e3d99a8e0ccc9d4f +size 612873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5597247c50..6b8950fd82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9549947d20900b9e08ce1c14af7206cccd9bd0e43599e45abf57e03b86b67a83 -size 542236 +oid sha256:4e3ec9eec6ca85d0cfef9447d111f1badd241db7a7d5626f1622b5de8b639e69 +size 525397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9f1b4c7491..749170d279 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cad79130b1f9ba3843c334dd849e64e38c8a14a20c586225e8235ad4e6e1b640 -size 631535 +oid sha256:adbc50393aa25858f41975cb58010d05d46ea4053788ed21354fe1de133adb2d +size 612623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index cbfbf0459a..7ed16b966f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef3c911c051ef8a6f3a5bf8b425a73e13115f474091619653bf466a147962154 -size 545440 +oid sha256:0cf2d4295bbe774d7f288d597d5a46367ca48e2c317e0ea2c95846f3875d2c27 +size 528699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e1d4690845..7c0f140dec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75e7aacd81ecb41983ad07834b4dcf7da78218d8ac7725600114182b034990f2 -size 701475 +oid sha256:584c3fb7272e23974de8e40e3192dcf38e7bb3b52d230ce7658bcfdf28bee00e +size 679998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 712bf0caad..0f56125c16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aa71185ecd8433e0a999c31e56cfa10a1c5ee5fd84305f519864847e7610499 -size 608868 +oid sha256:4bc27419654d4eacf5f9c6834acb0f3800a95fbfb2c04a376c4705d7f1c0d0c1 +size 591881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4d84181ba4..6cd0c0ae1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50de25c050d456c8c11b2a5bb1434b3f9384cdb4a114a52584c23470258c7637 -size 726605 +oid sha256:9504620584c2fe93adac85cdb360017e642dd97b7b7cc26343d25f27652c59ba +size 701132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2544ac1763..af6ad8d138 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a1d636b90723275c7c6510456937258718f9b162ff3ba42d378261a31b52a5c -size 642237 +oid sha256:c621f6eedb8c7629925813cb0765346711ecf568e998e81b7257bec473353639 +size 614987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8d2a96f4c4..d297f30008 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:fc3e66417f53f8fdd46806064636988e7db91497344f759fc9fdddc56446d349 -size 707119 +oid sha256:2541b410bb5a6f4af7dec2ee57e7bffea4b7f80f3225e77e2ab8f565765721da +size 687616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 03f1930f7a..0995d6d0e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8ddbbaca73691eaf8993f05b8ee83ce1ce10c89ffe07fbdf6961c8a3ee092db -size 622899 +oid sha256:9fe4113222e0ca009c4a3a6483b0ef6164f96573137d7508ef9eed7ff63528c4 +size 601471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b012db0537..b0319471ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a0dd0973ea59ecfa68605510be643509b300922d2a948bff7065263df1d5d51 -size 735483 +oid sha256:eb39e519bec0a87bcee4177d9392751a72833439df0f99dcb00401f06f4d572e +size 701772 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 51d01a5172..a18fadd3e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f4c0af046c78f9ea835839db10634480fe192531b8b6afdb695aba2054409c7 -size 646823 +oid sha256:18433087bbf910713b11d92dd53604912a02ee586b48f28507b0382e1adc36d3 +size 618094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ac66ed2c33..94e0ec850a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:166b189d2d1615a4fbcec1fff4b9f4ec50a45e82da03b79e9bf694d383e8d726 -size 715207 +oid sha256:8eaf7bfb1f7a3babe7ea392ee50566f7457f426a6632b3253db2f6918544bffd +size 688254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1a6f621916..51a06ac39f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8aa706fc064394f6ecdd5d75ced6b0c7ec5301a1683f4a5c91d7cccb6dbffbe6 -size 627485 +oid sha256:8b9e36467ef384799f2fb769fe67e2052a4619a4b6ea97e1a6a3922d96563947 +size 605365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 394a15947c..1c9480bf95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0e6ee16b6bad68bff9ff108895ecb370fdec39056296cd3c925b668b15ca578 -size 819187 +oid sha256:473ed7c7e386bc27317cb3003b5b8bcd2d2891ff59b8706aa5c2392b9cde45d9 +size 768258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 62cdcfa76c..9a8de535fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3bc3bde0dace12101a08995e7d70c54c0eb421760143780709f5a431ec85813 -size 710349 +oid sha256:0b0b6a328fd5e5321999ff1fa5e9941db3545eb28913477859beb8ac0324ded9 +size 682360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a5f70d1a8a..83958e0f23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11fa2f48dec1f0a0847056a6f7b5014ae2f18687f4965dcd216344a6b213626c -size 799749 +oid sha256:283db6e181b4eca61e65be2029cc6380d41a60c5671ceaeba0236d6fc0c79d0a +size 754642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e884002892..57e3d1f82b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:798616cada5e440334caadc171c4fd91d6e0b76b1b58f97b2e66f757c6057d0e -size 691799 +oid sha256:67c9ca9e94f1146c6c54c2c9fa44935054679fc06948e50128c40abbdd8dbbae +size 668844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c54e7b6393..572dcccff0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:454d924f0f227f1c1aad29cf5b0c27cb621945e690d7893475a3c3af3173261d -size 632079 +oid sha256:697bb9971a15955f828a9457d52fb95b5218cb828df748ed615e177fb0a2fd62 +size 606457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7cdab204a6..bf32198321 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1708c5d1689a5038f7a561ec5b6d8d580124a7a624867342f8b0078cec37d999 -size 544602 +oid sha256:d885d840c891a38cc5235cce10a390a5621761925b70121fa6d2876ba5afb921 +size 523717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 51aba9e533..fe240aa22b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6371c7e24164ca3d1e83c153410de9ee42b20e34336138902021e221645b2ea -size 628919 +oid sha256:3343e06d1f3a1f8d69b091d646823542130c2a27159514d13ad8af0bab01440f +size 604185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 68f96a618e..776cee5e75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:893771180273b86234062eed98d3d5adadda621cbf667ad3db8f60a35d9f69ba -size 544600 +oid sha256:649e2bc84a37a7f05c05020a448e1229adfa316aae1c1bcc025440d25c9bc287 +size 523369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eaae77f54f..a9d8fc9944 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02585c06ba14779dc0ab05f8d45451a323104846153b25c9f2e65d6b4d1b5af0 -size 699253 +oid sha256:f439305c6159fb5e7cd9b41b5f514a3b1a21fbc9324bab18f1500e8b1e97fd20 +size 673090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f05cedd07c..ea48265edc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a59cc8ce8048c8cb18531f7decb0eb6027d559561869172279835366df013ac0 -size 612022 +oid sha256:f345045e98f64494f15bf2d16eab99c5a2d9b4c143e311e970959a41b1ced288 +size 589905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3d0b910c6f..d861a26e88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f3972147a17fc32314b2602b7cd4e0c5df93a4a38702aed3c1b30894cc88981 -size 720733 +oid sha256:5b12525788d4a3b463e10145810797c89705c7120390170a1733918900f61a64 +size 692794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6ded1fd6d0..a958a4295b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c4f2fe9d8e4664cbc62f40f8a370aa10da28ccb22b99c0f20c25892d68ca2e7f -size 630445 +oid sha256:dafaad53df807026eb441ab74bfc6b087852752f7d6675b2f2defd0f0bb2e1d8 +size 604379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cae42d87f5..53cbfab0b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dfc0ecdd1ad11773e75d71f629499078e7a32c7bccb6df984d27b7cd4930f64 -size 710965 +oid sha256:b1331b9902ae231353508bb6e577239412bd56a1a46c170a79a1782ce078d3ef +size 685640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 84f48b2f4f..b8e2f26248 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58143bd44e827dee4ab076ac3a0acbb63cc31eb8867321a54aed6fcbc3abc042 -size 620825 +oid sha256:eda3e44249d8c89286af817e0cf8a207a957468372c13309430f3d4001c7269d +size 597225 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6befd23e9c..5930f567f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed716a57cfd4ca1773223da9170be32888bd8b766a5bb23c45f508cd75351d31 -size 716685 +oid sha256:a7b87232efdde552be5afae382b3ce1d22534cc81d386d6760e98e23b169b45d +size 686328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b2717666f5..b3fd1d1175 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cab11dae807f7fc288c64ca2517e07e69d59e4dbbc27734b75f931a5c6b149ae -size 629259 +oid sha256:13b19f99c4dbf6016cb0e7c532874639b9e98029896c32fcffd6c5767dcd92ff +size 605167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d9d191dc60..17113ca623 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7450fd675feb4115bcedb83b5cd7f9fe00a17b61251674aac6a4d639787ab031 -size 706769 +oid sha256:6772427f20244cae21dc1c3ce1c6b8d3252e491701eb60be63830b38e995bea8 +size 679866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index bd2a7507ad..ad6ce70fd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f713fe8a9a8056a341ff455ee6cabafecbe20e830065bc18fabb6006a43925f7 -size 619639 +oid sha256:632d81fdd25d81c3618af531edef39c691cc0afb3a1dae7a9a8fa2bee08cb731 +size 598013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cea43111ad..8b5af9d0a2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69fd24963c2666ddcc0dae11dc352241f74645126c3f6254c2c835b69932c996 -size 806161 +oid sha256:67e878180d3febf4bb1b783fb565519ff3edb845146022911bc4523d277b709a +size 761004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 97a4e932a7..bea47a0229 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a03990a2631783181903d8c52029dc8469a1ca588b4cbd1248a920f2f9acd98 -size 714441 +oid sha256:952a6d0c2f7ff1c6b2faa93bce2a927ac1abf7322af56044cc891b50b259a1d5 +size 671506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ea182b107..b442371e40 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4374cd3d0a921860c58af1b46d69420ee47eb46ab1c5593494e71e94d9c4e0b -size 796441 +oid sha256:cd89d6da26f6ed0791d70dba600a31b76dc5365bb00dbaf1e47e0a800133a488 +size 753850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ac0b0d399c..a522de9ba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:647e14f344595f7bfe6b4a6741d6d3193bd254d68b5a531988493e037145f9f5 -size 704821 +oid sha256:0dc4799288c4470a3832e615bdfdef1b9292826bd8696c567721c8b6da523ccc +size 664352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 99783eafb6..8716ce638b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fcf01c3d741a2a1caa8ae1afbc198ae4a84a595f5f17ef7d99b358a61cc196a -size 676939 +oid sha256:9297e72c1efdf7dd7d546f6df84713b6e6f7ac4953fd1a7e6746eb7d48af7839 +size 641106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 23d15740f9..089f7b4d7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fd71197b2bc6876a6f7d59da320ac3eae711d0a9a49d5442e7f34c54957abcc -size 584824 +oid sha256:174ddc893abda6c92a88faceccd807904224580ba20ddca5c2871e29e844b958 +size 555159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e66bbcb649..d4a6bd4fa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20480df206020429862bbc18e257f80c22f936426962b4beb1892eb9b1b15f04 -size 673089 +oid sha256:f19a600c9670248ca8aa7f2fe7f86b0748c85c7ca7e0c8c250be027d8d67ac86 +size 640314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 7449dcacfe..97a16d86c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b23635e92115a08903eb0d3a440eda3b47daf82a354bf2e26e72d220e5dbe9b -size 591876 +oid sha256:a50322fc2d31cbf68d579568306143cd90744cd17488ab3564ed62b802d120d9 +size 561225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 310cf58eee..ea0a6decf2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:5ad7c3d888be7e0be27586965c4e50ef7a79c9303ba8017c1dfadbc6ffda12a7 -size 774029 +oid sha256:c1826b3569556dcc4615c2c102fd63127310f74affcde3f440549b7f7bfec49d +size 732030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7b9c60eb9f..b29a4e82f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11aa8528e878fc028b73a369b14d83973cb28cdf4f76dd546dbec5fdeea9f1b3 -size 678017 +oid sha256:e9314ed61907da794f4d3a72855b520a7ff70ab180c008a30e16d8181e1212cb +size 641790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e51ac317ba..82aa706a00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8d27b4edc92168853983c12d13cf1c8009fea2b43c40eb63119d8b9f18270f4 -size 760511 +oid 
sha256:828bb338423b84d7fad0e574a8de31412725c6804f718ed04ca5a2143bd353da +size 721718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8a314e46ba..6006cbf1a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3868761900f130c3e31742ceb844bbb6974befe3686558d0fb884e68b550d843 -size 665289 +oid sha256:9b8f2483eb599128260b6e89c9b093b1d1f9554ae93db10926f942accf9583f6 +size 631480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 309f2d2bd2..723eb6bcb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb7ab45382078006eed5d96e09efd3312fe4feb33246699da7244ec7b2afdf4b -size 767613 +oid sha256:a345819454dc083f1847503ef86c1b419182da6d9b9f03defc6b860f862969ba +size 731040 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7dc75504b6..b55fe5593c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3b926c9afdd1c8bee895e7b9b96765f66a929949b64befc73525a3cf19c56a7 -size 690547 +oid sha256:b59408ef7b8af1f891d98de44e04c83dbe1913426223c04a2cdbf599474e4653 +size 655010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e5bbc75d58..b42b2a3c7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:caff3f7e102758559cd68eaa40549932dd4ed0489136da20ed164e81797eb797 -size 754885 +oid sha256:2caa7f5e10672df537b906a6a7ffcb1dc0fb1b8750e2c7f9e79357cc9592ea1e +size 720286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 30c7d4782a..306cd95f63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:176b6d5363138bd92eeb9719091164858b2205bda6a456f2fe1b761233702d50 -size 675499 +oid sha256:118356e07be537a6699eef445c3686cccfa27da4a551006d75beee6c01768d2e +size 641048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0e09779698..ad64429f54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee897f48edf2b6f2643d9f6e4dd0280ab511a7ff7fb4404bc6c430c4d322987d -size 677731 +oid sha256:d3f1265ed4fcaddc0981545afa2b8e05656b6181c691c98ddaf2830000c62506 +size 641900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
f879b7395d..a51c3ee853 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:278becb99f844994f830598050d807318400fa92534b7858e8f6355392643d1d -size 584828 +oid sha256:b7b6577558b6949c44eec6b4c07305fbdcdab48a7f64c75b14f4d2e39330b9fa +size 555951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7429ac9046..7fd65bf0c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2cda4d9e911900875da0cca14ecfe09ca1e75ab172ddab66a1f4d819076436c -size 673093 +oid sha256:96d3a1531ec98b8217712b1a43a874394cf250fc1c8c0700b52633e40059f3ae +size 641108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index f4c327d185..a59ae01f14 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e6faea757b1dfc255c2d7863f946d143a6bb48e1d771c8e3987f1de7fb29228 -size 592670 +oid sha256:1dea5b5c5a1c73aed5ca25391eb82b7f371cc14e9b42f38f9f183b66e973a995 +size 561229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index efa848ec6b..6f8da4295d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce8a5e335a410b4730334e604d7e6fcb22f15cefe1ea2a0843fc17902e985590 -size 673709 +oid sha256:7e4a9e79907bf370a185f42bb82620ee830c8b9a1c48492ec07764feb9c83f25 +size 660570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ae8aa93682..bd48fd7638 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aaaf8fe623802029aed7d8ab1360bfbd9ebb53ebb2878da83c94e9adfde508f -size 585000 +oid sha256:47effed7e6626364596ae67e5da0790ef958227ef1562338e644eab24421fdf7 +size 563917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index b558599cf4..b78aaf7502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:604568faaaa8a4da90c248bba1dfa862ac2b1a371aaf0984efdd6a50685bf377 -size 670649 +oid sha256:23f907c314107dad2312796a7d6e321515b13d8bb23e08014fb3dbb3f166bba5 +size 658546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 06bb10eb84..0b0670854a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5836402bf603ce4c87090c1d7178b7b9ffa65a04df6cc43bfd1ba081afd08847 -size 584800 +oid sha256:ffdb78ed33ee61a69c2d32933824761b3206f718b4cbcf1843db387a61c85533 +size 564261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e2ffa1c2d3..58e7e9759d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72ac94c157174e5a02dfd6d200b5bbd8dd86948016f0ebcc27b118ba69b807af -size 740145 +oid sha256:e450325c6f4963885a5ac5ee49c6fdb6c275e7f5470015fdfef274e46dcc561c +size 728042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9daa8ddb52..61cab601ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dcdd6ea9990b70d39f77166ac72d96ff34a0fcee15d75166657b3cd63b71720 -size 652421 +oid 
sha256:64ac8e7828870c8ed4bbf7c8bc60c9ffc7a9b013c475fae53de9af2cfe25282e +size 630106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9913bb1c4f..552316c3e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef4fedef0c97ff4d5dcba62c3655cdc445e9e9ce13a5a96ddcbbe1a9802be4da -size 761723 +oid sha256:f2779d782e134492b18fdfc570b3940bc513d358d78c71db39e5da83755d4770 +size 746264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8ad839bfdb..dcbeb18c1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b52e80fffaa7e4275947f3c7a950cea95eb7b63d6bf5e9c6ce5e209026e5877 -size 673605 +oid sha256:0c2ea306d6ebfd60794819e29d83af1fdcb4ddaff5dd66df079653213a8190dd +size 643940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ab7d23d056..0982e9e05d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7eb91294bef8b611b7812d2a08bf69bb38a154058546b2f1a1ce0942129ee9cd -size 752003 +oid sha256:5288856465ef90598f2d9e05bb83e6e084505f0ee8bcf73ccf87b59027826713 +size 739112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5f92732323..c31aaada25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:118c0981ed1df3491c23a4c717da0e981062f16a98f71a54dd55b4d1465057c4 -size 663935 +oid sha256:a849cc206c0884061939269c01f492304a3fea1733cd8210cbc16b2293c3c8da +size 636786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 533008821b..d39dafd0fe 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61ef3fe3523bacf3687fb8f8ad488cc555eef8fb40a808112f09fe3b2c474def -size 756885 +oid sha256:3e4456cb38a8bbcc7e4e26d2f1d2c6bfd99d3b46ee60ee71a9e7bd377cf8114e +size 740294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ecf6661e93..f586c4ea75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99a2aa5f60f6e21f325eb522e0d254e1868e91ed3899353d0d533c8ac042d412 -size 672913 +oid sha256:15cd38653059b5c9f8ad4a5c69405525799efd0651be9f5d7a5c938ad8b6323c +size 644578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3246a82dfc..16050292de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d1685363d12b008c8dcc9aac6bfc1761900e425e29b5f9a47c9427ab66ea6b9 -size 746969 +oid sha256:0785472ed2d5f2bd2b3eece20b2152fedbbc472b187a8d3d67eb6ad25451b785 +size 733042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9b089113a5..f8c790faf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:778a53e7d1fa8d09ff42a2323f390d2c58f14b9a65c002caa8ddabfaab868451 -size 663293 +oid sha256:e5693647d168c41aec29785ac9775474e0c20476fbd63254259fdf5374d437f8 +size 637426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f3a1b7d44a..2939e497fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:135a79761a31d7a82a491751beb7cb4c78dea9f5874ded9ff1e9097ed62d21f5 -size 847939 +oid sha256:4538a116573a380bf43cbcdb74b90c7c3ae45a0366483df73967897cedaaad63 +size 815264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bbccf56568..08386e78f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc146110ccf99516350efab49bdef56f9cb96f97eefb3de483719c833661a2c1 -size 755087 +oid sha256:93ec5172a3fad383494ebba7302b92d8c29216120b75b4644694f961dd566163 +size 711064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b64bc785d3..9a81517ca2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c697031cfcf3481c612bad9cfd8d7dc53b5779ca7aa7d089771f0bc5d949caf9 -size 
838221 +oid sha256:bf748a5e4d6f17c101fc7339de399fe622360531c2d15c8dd2ea443e52bcaa04 +size 808160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e8302980e6..28bf98a43b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65fd483d6091f93ee9874e08ca91f855ccdd556094317ff5ea66cd70e7c41ab7 -size 745467 +oid sha256:675c8390b035d256ba584038ba64e4cce9642a769a74b5f38da8f332c18ae1fe +size 703912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 19ff59012a..b49476adcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37aadfbf8585ed571b935f199bc7f6ac89931804adf568c23a960d3d7b27d972 -size 632813 +oid sha256:6bf37d498fc10e86265f7bbe7e1c07957af677dcf3e5d3f60fd2e855fb1c6772 +size 622880 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5f2a8ea4ad..b80651ebd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a2090cff8d6fa18100e00a0cf54586f9b4b12e48e98ed29596dd43aebb7fb79 -size 548196 +oid sha256:1285938da439e7ad22835d30d6583b882d06f5acbc52b08b7223ec81516ee00c +size 535847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index fb5456e3a8..3c127bb1de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e52e3059cbc04a92888e31198e143077d096338e5dfb18b9679499de37c2a013 -size 639667 +oid sha256:a363f6ec8fa83fb18185f5b645cf8660ac605976c3f82427fc4fe4b0233625aa +size 623076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 20af29ee07..18c3e5bf05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6ba93dedc89c8eb6452b86e5ede89b8d5dbdb388f43b6e6019b07c8d80593fb -size 551598 +oid sha256:e7b960b0b7afd2edcbeba47f8520b13bfe6d07e75d5c089d7eb75e70cc04452f +size 537571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index abe0285232..32b5cec9e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4d25741f0d0ac6f2d06875a26d435450797af0e9a4062524fd02f40adb1832e -size 699593 +oid sha256:8cd3cd351a8e41831a10a68482479c067bd3f70876337e6bb81b8c167c3889c9 +size 685368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
a00cbf6403..ceeea2daf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2725bdfb24b6827f31d935f477e8ee595f49f15fc60b9f7f3555bed357fba9b8 -size 614878 +oid sha256:f9835fdc635f8636653c3fb1786d3848975d4f2c4f16dbf0b1250849c38ed9d6 +size 601541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 943d8d1505..3cf9c7406e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7263c4d031e4e324c782c5be6de45ee9fb43ba7b1c8806e80ccc49ebd7c6d9f -size 734687 +oid sha256:0682c176eeec25a6ac70892cb73a20c18a24e3a33130ff79ba8e78a2ee9cc85d +size 710794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 29572cea8f..01869cba06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:909a1c0887e00d75500596ec1996c18160bedad324c95d7b652b3a2dfe38d9eb -size 648247 +oid sha256:13bb2051886f606f9b0195b975db7c29dd024de77b3c888680c520b4ed8b93d8 +size 624650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2dd2f91685..81e2a7ca0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:482b81f77e5cc47b56fda408c0ff4b659c05a968ea45f7f747abad9b23566a96 -size 715251 +oid sha256:d97aa3324e961fb16f041714aae7cc753828dd4d05bd84bc67168485580e9885 +size 697276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 78f3c30819..c5a495482e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:bd6e5459618951f1573eecdb909c6f6e236b6ab9e1214419fe95b0fbc13fca42 -size 629649 +oid sha256:f095eb652d5cac17b56addc2266b056b7b799bc5ddd0e8b4c8286789888d3123 +size 611131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 72bf581503..b2f0acaaf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64d5bda0098d9c9de4889f7690a9ad82eeb91c1fa7ea1699da17c4a99bdba294 -size 743023 +oid sha256:0a56857771cb5047a8ea9fa67df23adc83f97ad21239367c61d10b1d1e97ccd5 +size 712222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4d62f0fb6d..d775245750 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38e72662fc7cd8bc5da33dda102f61227bfaab59dbc896c5ca6011d8e11f96c8 -size 652241 +oid sha256:45c51d234245af0d0d36ae28b6bf78d4bdd5e218ce581be8edcfd9d7bec188dc +size 628546 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8e3c489720..68e635ae46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d87c7653627887bdd8e08cea26d76ca529505e36fe7a8976feeb2e9a0e70be8 -size 723537 +oid sha256:287e9373c14a69aebd51708d9d079799f10810cedf34efbc5ee5308710ec5846 +size 698706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6df477f518..d31c0ed41e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:315c529d607dbf00aa7d736516432107e59aa08499ced12db6850bfcdcfaf0dc -size 633643 +oid sha256:7faccdc86e8d8e1078d60c99c91189cd1d47f7ee0d8b9af868d89a1c11951b53 +size 615817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3ebd96c765..a63de0c63d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38c75edd6ebc92525f2fcca8ed956744d4cc8aba0a0c03d168d9d1a31d1abf1a -size 827515 +oid sha256:0ddcad4685176c3a6e1f2cd2597da59571a61043c1524c05489a09abba246a05 +size 777920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 69c12db963..e6dac12808 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59dd4909edfe9d55424ab240e9a6452f7da06c8f7659d23c4e64723dda5f18e6 -size 716359 +oid sha256:a957858993fd3f09b5d04d72fefe6302af2a7c8f7c4c46801de5090a5724e1b3 +size 692022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fb5212876f..c7a82d8eb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:812667b03551276fcfaff3103fe2747ce68683889e1384463f087397f5ec7d63 -size 807239 +oid sha256:71f41aa6cb62ac8f1914f3a803d68468b22a2eb3c035f23ee5bc741ecfede3f4 +size 764304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4bee5d11aa..b0722410c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba54247ac79a464633e9e148b73490bc00ad01b63408132bb3523bff1a1204d6 -size 697761 +oid sha256:9a60a62c0a2be25d02525498c0c31558afde07f065546d5a3d7846b7c82519a6 +size 678504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b0dea7ce52..a097ba4bff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:537a40395338c1171eed4fbf3de79c419acce2f5c14552b739c97b7e11d3b193 -size 680959 +oid sha256:a54e6c04e840785afdd47378407f99675274aa0ae786549aff48dfdc9d1fa620 +size 662394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2b59e7058b..c7a6f8b8ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ca9e5e612f173b4fcbef1e580687fce8a520ccc634386a82ae211187dded1aa -size 549330 +oid sha256:0cda4aa13790cf968736915d458dd3c4e00c865ecfaae00353640de1199c7058 +size 529629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 061e3a3321..28a432eb68 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16922d0121b1e596fa058976f93adda327695a0923b0ac3c2afad53ffd857740 -size 684953 +oid sha256:0dda511634915b319fa2b55d973446f68c4828911689cd3e9aae71f4550d10fe +size 668904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b88ecfda95..36c593eb01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f5650ef304cccac96320f2372a440d952f28dd1cc74ef22979ab138dc4740de -size 549328 +oid sha256:04f7e259b7ec14bba3a58177310277f8362ce4ea163813cec320ef973db372c1 +size 530071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e02e132bda..82dd46349d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ea8de7d251af7df04a60dcf8aeae21604a3a150e7c36325007d022cf122087a -size 747345 +oid sha256:f4121a4ae8a4a6c6868dd078d0be3da984850d12b445b191d04cea6a5e69a11f +size 729816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 932f951ae1..8bd7e33536 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccaf83dbdf12f70ec6204b0dd3568bc19856a8cb3b966d937ddd8d9913b24631 -size 615912 +oid sha256:39b31c5e3945e820b61bbc6b315048f363c1438be517db7fb599aadd70a3a3f6 +size 595027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e864011591..f176dfbd3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4f2ce0c6af065a90fd43d5d0f6968bfba7b118027004015aa17cbcb8c91869ab -size 770403 +oid sha256:f33be42270faf78931026be42266d333a454fdd8fa088881380733d2556bdb3e +size 748730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 82a835e414..ed201dfdfb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86aa832dd2de3736e52e39d6970597601edabad3318a9a88fe9e49257bebcbbc -size 633593 +oid sha256:5464eb13e110a7d0a81546b756b0f5e18b30a3355ccf5bacedca390c61f1bc92 +size 609501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 76a7b8bc30..e6fb7f4973 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71b800542d9f472a8395586dbf1527dcf61ece1dbc2252e9001f79dfcf937074 -size 760685 +oid sha256:59344013c3f37c64bb096f2804e59583e626ddc1810991b4e0d8323dece13c2e +size 742366 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 91eac549b9..52e670c29c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3f37da9cf987906f79c1d543dd10f9ffd2386602ad2b95383adc33d3de68fff -size 624763 +oid sha256:6dd07452c699f46c8512c1e677b509f039b36a901eec94be59e440ab156610fc +size 602397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3df3aaa427..69ea364fb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:579ed264ed6089b627db3f4fe3c872624f7529697d083e86163d185c19e032ec -size 773509 +oid sha256:a4f7a58705167c1c4d092be826367c74dcc4e5fe9ad75df8a8f1104821d31663 +size 752230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6d7999bd9c..c98fbf045f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5651600b8a8605645d487f1fcf5be619d4a187db3770b1423bd6a4157f05806 -size 633987 +oid sha256:0d507a54b461ad3762d36388fbfa237a505f8179995e4a0601a043d6b2742113 +size 610289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3115babcce..f08d1ecc18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cab6ba642a64d663c90f726812146fda2d9b2bc96ce55fa16d99086a02cf8cb3 -size 762903 +oid sha256:dcc330d9fce46d25269e042a4dbe041e3987a06d963e2ef343ef867ef01d38c2 +size 744978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 637f249c4a..011b911eae 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b247a55b3251bc8dd8e537011e38021b12fde77b4a98a15efbf14cd0267f41f0 -size 624317 +oid sha256:a8941d42e28cb88ecd5443ed885f3c6481a860d88cc4ecfb62ec7aa747707860 +size 603135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4d9b7ca665..8d57f3f123 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9243224f235ffeb20113eef25281e1ddc599978c5c264e3b3e0bb28e6c77f15f -size 839945 +oid sha256:bf50028192583d78de94635141489ccd7ee79069b6b71d411fc96867b646abfc +size 818518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ea5a842d38..4c0d6c2a62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1942032e88e2183f30849b4bafb718aac55776c1da2568c39048ac1a807c0f2 -size 717591 +oid sha256:615d0c5eb8fbb04d78dcc5d9b3c65fcedc671bdf8f07301d6dafe14df74cf85d +size 676628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 05654ad135..2991da7d5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f94d774c54f2e0860421fca93312454c6f2f003cb27a7b455375b3dc6541e67 -size 830177 +oid sha256:7a6639b98c5a5475542959932d5398992b7197ce2b3dbde8d17194582dcdbfbf +size 811366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ae1211b3ef..db7c3daac6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5608c0d4afbf7a9c5d1034a0a2dacf590aa46b324bf8b063d6dee76ba5cf18b -size 708761 +oid sha256:1b14349cef452816c6d7c33e24756327c8bc0a400bcc0a377537011b57880cab +size 669474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 162bb35294..f42923e0a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5263b3bf938fb9a217d196aaf16b144c7d554777ff3fcece269cc1493575ca4d -size 688855 +oid sha256:9d882664178c6acf39fae0c7c5aeaa9caea41d8617dd94589543a2ac3ae5b4ab +size 663038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4fb3011509..227b51a7f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:697406436f27436ab4463189fe7a5cf5159903195135a76d0123f85a0475e41d -size 604240 +oid sha256:8b6dd7123be6fa295db8a662659d8c4e4867931a13a635a40a708fd5bc0cb3ef +size 580297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c38c9df65a..690b15c08d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c635bf4a3dbc28e697e015deedfc1598acdc03c75c99e07b7b65469f1c0c6536 -size 685055 +oid sha256:6e363e250a82d527a6c090bae5eadd6139a07f8c34046387bab879cc5f24db48 +size 661506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index c138e957c6..d956190ef7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d61b1ef9069e54bb8de2264f3160fe6249f8684cdfd25d6cd10c1327bbbcd1bd -size 603250 +oid sha256:bc3648600a19491b83100b690ae26daa48e20499a212fbdbba7b7b3169be3407 +size 580639 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f099b6cefb..778fac98c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bc703e7d58f8646d483cc28f35668e2f7bd2f9c0d2355cc2c00041c38737ad7 -size 755289 +oid sha256:6c25018e128e26180178c63f541566d2ec6fe12e99825171a4458f3afef07aff +size 730458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ebb48c1d21..b1f63168f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25d9672de8b03782385093909f8b8cbcb7e58e2f25a6b3f319b312dc042370d6 -size 670873 +oid sha256:64ff974b9be9ab47f64507f02013714a729f94008730b1f2f27805003ddf46fa +size 645746 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 41d304cd63..3554487128 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7011484ba3711f824da473e56c878e6dc7dd7249019d5b31bcb236cedbe4d52 -size 776867 +oid sha256:0a5b0d07c855723e87e8e2760053d3fb8427a1675abd7090d54a50f63dd7980d +size 748732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 20675879c5..df4806d39a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbf447c202223f50b9ccf7ae6cfd60342d5fd1e6173fa8961ff74f014750d8ff -size 692499 +oid sha256:bde1255e2639141cde6f5344f6e6c3c4505ebe18b5ac23c349b97b5987a25265 +size 659332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bba8338570..7b52824d3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0798df997db617d55ba01c595ce4fa0e50ab90f21e053630094577b2cc84277d -size 767149 +oid sha256:9f913edbc4a87635c85b7a442b3b81bdd983aefbce986b196334f60e9d5c6ea2 +size 741578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4f12f069e1..116687708f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2594a29d30b93709ff6aba50566d79676ba17e7f6c8f93568a44cf4fd5c0e5e7 -size 682879 +oid sha256:524b744aa03ab5bedb4b4d4bf1a25b374eda5a8de4fbcb1a3e724a958f7adadc +size 652178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8126e252bd..e3b41fd439 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69d039602ed5c2b4cd6f388dd18bf8780f13b07226c9175a8d8a2eb8a0071796 -size 772081 +oid sha256:e01c671909c551077fd482b15679b71c7f821cfe539fc2912b3e7c5de868d923 +size 746608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a5d5ce20bc..7112f0c58f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0aa806058849daa53e127ec26804be5c837e825476a864bebe69b77ae7a0a1b -size 691067 +oid sha256:414ff338a83c250d8ac2290e085984246448345a76ef9a265fb7ed60d459eeff +size 659970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f153c95865..9977134b41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9affabad4a3f7bc25c2e530dbc63bc11bd1a91a0a82da74c2c0f100483d8350 -size 762115 +oid sha256:f9b6678f6afe83ebb1a89370a84827934e59783407edcc4faa2ed1ba28cd7b8b +size 739356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 58c0023abc..a1498fb755 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:764e198a934de34971cff3a1d5ddace5abf7cb3c845f24fdbe817a4e71f55daf -size 682237 +oid sha256:ab6a8ac751cade7d0ea18785c5a6409e1244a0d85f2093b022f304e441e14b0d +size 652818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d7ac244e24..0daea6e1f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:21a38c8ed5360fd9db17e1a80e7b7be1451d95af7d1facf73c576d7f284003d5 -size 862345 +oid sha256:fba5adbb7a280b15b2841465385669f469ac473f992f2d53dbf2c01e231b5088 +size 816942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f09b49fb17..0b9ae52b7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc2b5353b46cb46a43f28ef8a61026cf0756ce9bfac914f5546c0d0f18edd869 -size 773537 +oid sha256:67bf395a7ffade9ec61a7e10fc6fffacada5207f987c642932a0f931000570e4 +size 728972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d56671f0ab..57fe2a1553 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1afec2a3ed994bb8a034f64780cbd22e25ebdaf2ecaca2be4b2a5f573caca734 -size 
852577 +oid sha256:dfbe9bcf5f797eb86e3865d09002741e1d5d6fd0046c67a45b1a2c0de419b0c0 +size 810578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index bfa366c574..190cdd2edf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8649519a87c169822a1dbce3e18da0e787d9ab4bef8ad6f086bd8528f2eca366 -size 763917 +oid sha256:17e2b96c6cf6e4a74ad389b9a9ddb74e0c275411d3d792fb042e2496700a9e70 +size 721820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7b05da168f..7def2580c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:072a049e5ee3979cf6857f08d816f4aa7d5c46d4ba2ca75123390c35f61f8800 -size 654963 +oid sha256:ec805993bfe9ec077b8a940903d5dc9b8a67bd61569964ecfdd8be0b39027e5c +size 636890 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3817e31738..76a020251f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d3659184493e5c7963e8fbbe8ca7f08e4e956ca9af565c90b113c509c754db1 -size 563836 +oid sha256:c311cbd403cc7d7784875f7a6c5c40aac38c8ba44195d233bafed6b9b84215f4 +size 547835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 96312c975a..b56e5d2470 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee898d48357f26f18a46fe52ab84dace2a3e6dfeb7c1a2bfe7dd467bd22a41a7 -size 656441 +oid sha256:bd5faff35a1eff7dbe811293ccdbae5b762ff389b2b9e2a3d448e5659e8cda0e +size 637432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 491ae1f985..20679c8e60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99bc236be727d59e9647cfc3cf6b0a1fc08649b94ba55b00eecfd38403e4b6aa -size 567138 +oid sha256:f4dd1b996be6a288f52db8ff5d3783f714b40db665c011ca545ca3584cbd04cc +size 549559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c7f99310b4..087e8de674 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9710986cb9ed8be91bb0b8e79214cd810a420c15e5e4ddcf342916d53a86093e -size 725591 +oid sha256:cb51e022bdb16d63642f35221c7d5402e71179bc2c104e73d78f9bc0b82b1397 +size 704806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
bc2c9efa3c..3ecff5a1fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71c8420ef97aa1d353aedf5c7cbffc737a37a58fc3f763d77c2c482a978f8323 -size 630469 +oid sha256:a556d870cbbfec440136afdb2d2954791bc138cee3e08b0bf676260821b8fdc7 +size 613529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e20bd87f44..d4fbec2d25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ad8b1fc68fe2f38ee415a14000e4dcde168587beb9fb8908cf6fc704cb922dc -size 756987 +oid sha256:0ad0939ae35d3601ec6a5761e0fb78d5a57845beea861be3af8b99b6b3f108c8 +size 725150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d5b2ce8a44..cf62f63c3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88e3c0fb89d925c39b2a5559b2818c9a2acc75b2dae4b3213a2ad2a9e2d3159c -size 663837 +oid sha256:bc02cc7680bf4fb3b48cb781b84fd04f8355f2e39bf781ce2f750920a5bd0752 +size 636540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f0299c1760..891aecc383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3ebb3e825879701bd19edaf6a4f4f74a8adc4d8c7eb2e7441661bef96e29667 -size 736759 +oid sha256:fee75b1e37e0336ff8619f9ea1660653a65ee3c1c4a83845333825bd14277913 +size 711632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 703db510d9..3a8c85f885 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0778a6e8aeea8015b07d4aa63910650e3b4b93e3a4b1dbcdd72166fabd7c6f69 -size 645287 +oid sha256:19c51b831bf1f87db3d217d14fb49fc650cc1236cfabf1c2d52c1b90258a12b9 +size 623022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a7f878f621..dfc9ffbf4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89dc3871c923d5b1a5ea7df78ec1663d5cbb0798143ff03a893a86f05ed53bdc -size 760487 +oid sha256:5cf79deb6f8e3ca5b451f90c5cec6600b003aa5e594a4d511d2ec95f4119e5f2 +size 726578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 258d636cd8..7e72a20424 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9a79554b21cd67811bc4714b6bd1da4e56dde0c117b31e70cf8a5a1d67642bc -size 667733 +oid sha256:ff17dd04332ec40cbcc878f8682730bfda53e01fcdb397fe66c72a705fa29ed0 +size 640434 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 59c18c441d..5c888e6753 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e75b69ba0e930a0bb9f651f59af766823688ccabf1e0cd605f2d309c320b5431 -size 740211 +oid sha256:9160d59a1af404aff3aa09cb554fbcba44b44e664bbbf799f55f6633bf21a634 +size 713062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a43eaf5e88..f8aa69f1b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a730fc9db6dcbd64d3b7198da78a87f2909286fa23975f296c58ffbb8e27fa9 -size 649183 +oid sha256:2243f6946bacc13591058fa059bce584372ad2681a0b100248f7efad1a218f74 +size 626918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 608e5d4e34..794e3a1ec6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b8ee73faf818e7f773f2a2d6608a6d061c977300faee3ff36a5a4e1ea0a8c0e -size 844191 +oid sha256:e0df5ec7348a59bfc8eb3e8e1de6c997329e0ad85d8964ef2ea54c1416ce9967 +size 792276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7a97444b15..f5436485fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17b39968f8b2aa12905b3cc7cbf73f8161d38485da4740859f2c7bf85e3533eb -size 731949 +oid sha256:f4852e2145e3d994b9fb955502fec62d21c35b704983ee7ee465f059e3d9f00d +size 703912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6a330235d1..27195d779c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5c40da7b76a9c0fb9a37c7efd813d8e72d81ab614235526235449871bc40341 -size 823963 +oid sha256:1231ff7027d116275fec418abf1a0a481411177bf2fcb3eb9a04394820cb1c62 +size 778660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 00aacdc00e..da23f1201a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5af4eec961d88a781ae6224f4679ae57d20ecc7206c2b6a4dc9e17d9b15afa9a -size 713399 +oid sha256:1ec386bd927c1aa2a3287149dd10054a706e66d4b055d4dddef87acee4568a80 +size 690394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c823f815d5..50e0f51f92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb117e3c84735649a489e3d2c57777f439ba097830d494c278d1b289e361f98f -size 632071 +oid sha256:6f812762c5f70d5f836908943923ffb9da1150f0a8dcabda59406fbced273033 +size 602305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 81373591b1..8feba8fc7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:924fc2b32882fd2d6eb960975abd404daa12582a9f08fc8c27f826cdb516966d -size 544594 +oid sha256:4f427674987e73bb4e5ec8afa6a29c2e1b3efcf1cc520758ff2d2565912d343c +size 520355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 647e9bf8af..aa16d32d2f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:619f551009aa0414ef9782622737ec7e87da4d1a1b4a1ea559deb934ff0970f7 -size 628911 +oid sha256:61fc067114f6990dbc8cfe8b7a30c7a61c93aa75dc13315ddabdecbf38dfa211 +size 601613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index c285f743bc..e6b2376f16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ece983e9727fcaef40e368ab27963a52afad0fda60b486d0a3db798a8401af9 -size 544592 +oid sha256:1596852bb687ab7c96ccfadf55bfeea3b58783ef3ba5ad1f4119ad8f29b0d5d1 +size 520797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 25c06ffda6..3cc63ceada 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dd21b86e72567e9f9374557be9ac90ad09d0751385a48b09e0cfc955973045d -size 698455 +oid sha256:4ea700565058d221896a8304468b0c8dcf085f89cf75e7d6e3346461f9121046 +size 669728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d4d58b2e0d..8f55a50432 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dd05a675320bc91531c92067a17352219e4274eb4730a0d8a0f5d2abbf458a3 -size 612014 +oid sha256:42a3d8c36ea2718c083353cf568b91dcae849ab29bd848560e0e0d51e9f56097 +size 585753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bd62b14532..e089ef004b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:307eb3767afb2a715dc66206d6d62503ff483c3976e5069a5baf4c0bf957c49d -size 721513 +oid sha256:47ef998e16de8a23d095715159321f1d2b1297a887bba257274adbb206e7ded2 +size 689430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3221d7d16d..196c6c3016 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69e3d3b1647c9c2cabb8cf4859b8a33b84f99b5e87c1e7b5bf3f2d3841fc8605 -size 629647 +oid sha256:7bc6a02ce0a663542c3265455446f79ebfcf6a3b86d622affc6e103369f477e4 +size 600227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9452e5274d..444f53407e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f2529d9f41e0e26a45cdd4557b9c41bcf707c255f967a3ec6f2036584106473 -size 710957 +oid sha256:3a2fa9854d26cbc1be77a7b7dc9c239442f0cfe9482214784a96aa98adf2a6b6 +size 682278 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5bc2c45da6..f9533d8696 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19c5b6d086b65c26468eb87f84a508920644f328507213d1cb081d0627c3b3a3 -size 620027 +oid sha256:208c329b81b50915d81de738b497b235ac4cd3f7b27a904840a1d7c8bd829932 +size 593073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6ac91adb77..61e794a206 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:688de7350293f9f9d3d9a28ce145eb371dc972443b20555c8c803470691a3900 -size 716677 +oid sha256:2a37fb86d9da440de16f1f8ca9e0d5b28456a685144b8c9571af3c154e74469a +size 683756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4818fc5b75..0fd3eebf41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e747211e65850b19c5373f0bae8115116c9d8f6545988d20bd23268ff4535a0a -size 629251 +oid sha256:36c72278208260994436069d992ce5f85ab41dc5d1c35bab285724bab22b78b8 +size 600965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 458438ccaa..49fea4652a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10f91822d563ad1a20e0b4bb544e8661dd13d14ff7ba7116cfb506a37ea2707b -size 706761 +oid sha256:3a98c1b0d314f8a79b6248e0556fb9e75c29731f5524588b91cf1bd1ceae820a +size 676504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index dd4f0223b0..76cbdfd92a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec92ff07046451bea37f86a49bf6958e5561d13f2b57fad374e470f91d459fbe -size 620419 +oid sha256:15cf5ba449ae2fe058b5867f4620a99f6aadf6269f15da53d8722335c91dc1bb +size 593861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6dddb83193..824364fc7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32d51e1e288559482658ff1dcd11c3743a36337e3c171602fcd42c5768d2b385 -size 806153 +oid sha256:27c618df86cc5ff7e57204c3df7e958ee05e8b03e8cdd62e122b604a7e5ddcba +size 757642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ec41153587..863dec09e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d154da62e7efcc3d303d05bcd2d42dda09b45a55e6e56dac10fcdbc5dc8f2fc -size 713645 +oid sha256:e0eac7206f84683ae8175e5ce18271baff7aaff6b469e592f8372fcdebe2057d +size 667354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 003a5821b0..090cdfed34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3202109dce820bb1b19556cd076a81bbcd64626599f51f844836d4a6a17a68c7 -size 796433 +oid sha256:fdf7f29506e83bca0d2bca9276e7af1eb3d687b7fda89844c037641ce4a291ce +size 750488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ceb34e3814..2f2c06651f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec386f49ceeea11893c89c5a97af4b05be08b5bcc518dc00de6ba0ff8474ce8b -size 704025 +oid sha256:62bcb9c6d662c26fba576d2c3fdc008880deaeed4c5c9da78b06f0a6da00f300 +size 660200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a326be6cd9..ea864ad09c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e729333a705570e07ef63d3a8802723678e86b7da9177a4df6d8c98bc05b7ca -size 664533 +oid sha256:3bf30f3646bc67f4cf1fd89b26e232841ce344b6096b23a860378ff3d6f18fe4 +size 644390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 83ec867b4e..b4ac8ed1da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:aedfde3d3b586c6f1c9d424a9a33cd0bbc6c10ff3d854a3ad8e9b73693787f32 -size 578684 +oid sha256:5fc33c10af0cc854c7e8d2de36f9b275b5305dc821ca0b542d67415ec33df631 +size 561057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8199f2acc6..8d02ae2ba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c55da8e49540f9c815fd86ba269cd0d7448fb90ba78322cb690cc17c351e8b0 -size 661473 +oid sha256:4a48f2f61a3b147fa43423bf4d9bd103076808ef10d51e6b97ba47f6dafea5d3 +size 642118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 55fb44e218..45ca292ebb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36ac3f8b96c4631190c7e6f8f8903960be7fa8eb712a85819bf0f2ba693a2760 -size 578486 +oid sha256:7b506bbcb24fdc6e314a3f68c1029eafc783a7a2140713571e33aee616c2551b +size 561399 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 871e562047..e2bec3426c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:355ad4a2a8f4c5e24fa8fbeef11f0a7272ddfb77c884f774eff431d05ef90bfe -size 730919 +oid sha256:7ae3fbec0b2aab55cf63c67e9752c22184dbbfbdafd10ee248457f094df47627 +size 711022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7e924b4dca..843e419c0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3152f696db973e451a363b80fb0a9c7709388b71001e5cb1ab4690c0b1deb2d9 -size 646057 +oid sha256:91b7b4e628d4db65dcf541c84e32a30b8faaa0b675ec97f651b7c3a386a3587a +size 628034 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4f2ef1d46f..dcf0f0dfac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ae63ebcaeff0e3ce88deb259ce99cb1b1f61e1502f946c6544c1ee90de6c045 -size 752547 +oid sha256:4b4f0ec79d145dd1c7b4c84fff240836735bbff1fa6e6bbd2766e03e34e44f4e +size 729294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f6fa04b8cf..fd3e914980 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e0e7fde3bd3136e65964d434281988d3c6fd157d8e7439c5d704a18978219fa -size 667241 +oid sha256:dbf34edc9d6ee6edd4196983dcd6b970258810dceb6ea2ff26ce548a3517dcc5 +size 642608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 105f84907e..2812a40237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9bfad736e4638b438140d2c5650b068463a24c561b93009f4c97a36f80551de -size 742827 +oid sha256:b389003faca0c4db009db39c625ebef2138f6d93773203ee15fa310eb720e4c9 +size 722140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 587e97316b..ee6d0ea994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6e39290ad6f5fdf27f2de5ef27519a8752e5fc5fdcb15e0d707110ff6cf53bd -size 657621 +oid sha256:c6aec88829a09df6ea08df88adcca8a96ba614dd80b869d15a1b5bb5de572b87 +size 635454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 449d81fa76..b781d12288 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:877b67ad355ef56674bd3ac5362281c0441c1da950cb96c6d69f34f60eb216c2 -size 747709 +oid sha256:eb1ce1001f217d57224036a14d5dcf57e7160c57568894a04b5300b169e91cf1 +size 726282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6c879c92a1..41044a1bdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:836956ecbc5c22802f4bc8f2bd9bc1dee5fedaee2b88cdd979e98c1ff03f567b -size 666597 +oid sha256:247dad0dbb01f0d5354311e50123dabacf4812f278199fb030956aff9faa4d5f +size 642506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7b60d11998..8cd095e122 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b0e528d722c021faa4f9cf8bd327986a89db56b8c539a3441ab1dd69136cad3 -size 737793 +oid sha256:5bbdc6a600a98f8d4c8a0164c5d57b73d65ad47e6e2d429df7017ea1313d3e95 +size 719030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index aee9a31905..34c5a75ddb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7710790117e3388cfcca06c667a1a0ad8ec056d42210496620aeba7ad62a61a -size 656977 +oid sha256:1dd3f4b89ef35358db72bcee4f9f543888e0a1c2aac82acd42ee9331fa2a19ca +size 635354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4b023f4381..ac52f03cba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:37861c7b5b0447cbeedd53601e4ff91b3f202534aff674b1ef917d61ec6c0672 -size 838763 +oid sha256:d744902bdcffc185d644bd78a0141c997da8b9f1a5a583c4763b87d36748a792 +size 798294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6ce2761e92..2404cc7b43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6069f3e80f27c17e85709dc704e52c0dd9705a756bdc372ed1edee55eac1fc85 -size 748723 +oid sha256:3674eb2731726f0accc25866208c8ab83db588ac481114f806145ae340e2f236 +size 707908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 11582b4d20..b41bb259f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e810b3910760c9d0dff898398f553cdeb53de994108889aaf649ff968776c35a -size 
829045 +oid sha256:d508a44edda7d6b02fee9d9a4acfc6fd4daf9d5a266c16d23c54fcacf3623596 +size 791140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index caaba72932..be67426996 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:012e214dc0801fe38d6679fbd5fbd277d566eb5a2e23e045cd306434c09c5aa1 -size 739891 +oid sha256:993f23fd9f6c333c3c8bf267fc8d6e5784e4c66f1ac08782c7e74a8465b2d21f +size 700754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1e5a02b3ac..dc867151a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee405cb9b40ac992d2cd637c86b2f947f3b365a3396e218fb3fedeb50122babd -size 632467 +oid sha256:eaa58f5af0c72db0818f81af078d02eaa2607c2a7b7567508f5bcca085533506 +size 614443 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index fda39898c6..622477c5c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f20b9fcbb6c23dc4beac5a1ecfa4d7a44da7dcc2f3ed110ab89984330f1d6c94 -size 543806 +oid sha256:89cdb14f500bf4ff093c5aef20cc59372afa15590706824299a25968b84af25a +size 527805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index bb987025f4..b903f9baa8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adb80beb81f77631f9380de783e6d481a38225b3d77fe5b703c3553e8720f67f -size 633945 +oid sha256:dc3776212d61df6bdd12faf440495e9a59fe6077b6fa736c1a2d7e87f7c81abd +size 615033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b40859e819..42b2e4ae3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:399fd17663769398e1cbc5c295fe67b69ef763c526da813fcb7a2d25fde0db54 -size 547060 +oid sha256:7aa2c4a0a483cd9478ff98bd75a83981e76f713c7cad0645b805e6edd6cf05db +size 530319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2e58bfdcea..e1f1689e6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85e050f39a4241e095c4cb93f790634982604d116d8c61354b0398ad3db14897 -size 703095 +oid sha256:886a3b93b26728c5f37e005600577780ceca9c9bf50f3e97a0cc2c93addfc3a7 +size 682408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
a48fa13439..f76cdd70cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f98453d1a9a2bc6b0b381079fe06b31a01448b9f6f96769af580b136fb0ab85f -size 610488 +oid sha256:4ff329dd2303c83c9b9690041ceddc91118a290227e9951b4c171dd2155e52b5 +size 594289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ca48f31bd1..111483e346 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7572cdaa1f259bdd828885c23aa27ba33f3074a4af94f411cf8d8d513125743f -size 728965 +oid sha256:22b69272f0f0cbb2b42a2321130c15b7db4449eea56867bf2212bbe897edecf6 +size 702752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0c1a79cef1..3ff08f8b22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5f8d1048294e98e6272d170af94c258d6fe358957b5ed7b5ff9cd614b5b606 -size 644647 +oid sha256:a4944a998d94b4370b5794ae7d2f2ef209ce0cebb2f577710bf5b3505d24c6a2 +size 617347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e5f510c9b5..91019e1709 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4eece5e197de6f4bb0b5d1a70e3332675554764a0ec9ba95ec6cf431afbd8b9a -size 709527 +oid sha256:1511a51a126aa795d1eb302ba45b72178cf9d58157a615d391212ae631ef7131 +size 689236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ed656c0016..efcedde9fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6c666028da4cd029fc41974865020a600e17fc8215370ae068f6c2300a24869f -size 625259 +oid sha256:4067a45c6c92d5b3fd64a5c9ae1cc6364fd84d7a8baa7437ca0dd111a9268792 +size 603879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f2507d585a..97029c4d12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb380d40a308be6a5ebefd6e43448a38337efbdafbb1b72ac985e0d9d973cb39 -size 737843 +oid sha256:47f17499ee78a45e4d6e3014e2aa80638764242158fdff38287bfb488431738f +size 704132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ceb350d905..1fad1d6c26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b8b929f7bce3a4ffde82928b9963d391c3efb797dd3794253014616c4c437e6 -size 648443 +oid sha256:28df1d1010442e9dd9b73dd56d95bdcbdff60252f59bcd496567250062a04607 +size 620504 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b03ca5ad61..b13c55e978 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bda7c9a2b3beb2991fa3a695cb766d0786e4571f5d5c03adab79ca0e6432ada8 -size 717617 +oid sha256:a0c6c877f9461f4ec04a9842e8ecf28c278680c7641779ab40baee43f148d2db +size 690614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index fd14ba0d0b..29365dbb37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3b2d26be0dd9b3670e0a4c8a9c69d06d9a627bf8d439496d8b02067266b0333 -size 629055 +oid sha256:721f13e6709d2959f9aa79e49a5467b3f79de9d8d96fed133fa918ec11e62ca2 +size 607775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4cbf536ef2..022158d90c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83e898c6557dbabde19276820f781bff165c4c53c9d566e427b00a5b175c90e5 -size 820807 +oid sha256:fd2ed040f43e40dba1d048033f1db7ee493bec573d99774636ca7afbd62abb20 +size 769828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6a76a2816e..7b445afd63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0f0845d75da523f7ee26d969628f3c1fa45bcc279d9d56afcb2d743470ebaff -size 712759 +oid sha256:2c0786288b80a75ef701568e0f116bd349fec7d6bfa2696865e0a6564925fcd4 +size 683980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2f286a850d..a8a5921ce7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9c0b7185638b66cd15c9f3dd1e22c99a7c2793813e4ec448299320988f078c7 -size 801319 +oid sha256:8f7fa9595cdb7ba65a3c8acb47a19e0c1e50561737d52fa2634892904604acb8 +size 756262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7e9e7878f2..e273d59157 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:818e786019f947e28296df9ecf556dae2f1f2635fa4c1453621da9212b443e35 -size 694159 +oid sha256:324212fa5e811acfa13f62dabf5afc7277bccde9c8497a99f42ca88a6d4ce2b6 +size 670464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6fdae3f629..d259b65f6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a5396f6d427ea7c08928f6ef43f3c39932d2cac8880d8392ebedbd22dda039b -size 637645 +oid sha256:26aa671a07a5315e35951cb4ad364b01abd4147276db535699840f28461841e5 +size 611235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a08e511c8e..c3d776b2ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3aee18ed353a271c93504eb022355308d86eca27d96d835aa0b361c065ea2323 -size 550168 +oid sha256:918ed8334f5c3497204497576d274d37aa8efd038b6d5564ea8676eb48f9f08f +size 528495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 702656203f..725198286a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80b5690036098e1d1953f46d133b76700f6fdabf4c2c53eabedd37d9cdce6ed8 -size 634485 +oid sha256:93076feb69cb20c53006aff3f0e6b395466e8c5001c37c29df986da4f5db8aeb +size 609753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 123f0a844b..87cb8c5091 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb9bcb6ad4174be498f0f83decae9827d9a8a93b624247157853f769484e5a72 -size 550166 +oid sha256:1d1e9675454cf818f245193dae11cfc4122ef52d0e706be5c8d5b870a09a5465 +size 528937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1cb261bb17..94274a3855 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60adb00f534340529d55045b205cc9621677214476917832e2c043446b7beb0d -size 704031 +oid sha256:a519d3e4be79f007a0d88c1368a719cea4eac51cc8c874f212308e8d0b61a2ed +size 677868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 732a5dbc5e..24ac4f2130 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86cfcd24dd3b1ae83f87a926a7e058fc2abfb0a787ee0625df2e5118b178ebe8 -size 617590 +oid sha256:05ba1b57e63d443ed45c575d33d2ce73bbf73700051473b973f8b4d4a280c2d0 +size 594683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 190be10c53..2f8e73b5ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6bc09927ad4a76202109de07a8650236aff55759b23ae08746066a99b818c166 -size 726299 +oid sha256:f49be4ceb30ab0604d8fef1e2fbfa4f4c2b8c0cea48d1371df72856a841f9d2e +size 697570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 80f5124da9..1a9aa5d4ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a82b7d846fc564f1389db98f485fc4cdfe5831f2b7517aebcbd2c2394b7d4357 -size 635221 +oid sha256:85349461d5ef4059edaf7be846c925b17529988da79c891e17a60b674f405329 +size 609157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index aa78bfa6dd..92b912ab8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92f56648d3b7debe0d9ddb63042eb37098c0c31833b7cf44dec7cdb95cca7618 -size 716531 +oid sha256:7895b52ebba1030f6657c828ec3b035037e050d9af5b2325812efcb4fbbb1696 +size 690418 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f8bfde22d4..29b40abad2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a279e3efd863b5b5b81730962d861dad9a11b3ae45e5ec06ea942ea22151146d -size 625601 +oid sha256:813b0737761824fe3f1ab4c2712942e9a5f66a725d0d2b0c3e8743f33deaccf0 +size 602003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index be0440589f..5c9d68378d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c08e505ae840e375b0a3abb54478466e71adbac319ec9b5c5cfcefd24abb5d8e -size 722251 +oid sha256:4658d2b365a295213f172328b176b69095b5c487abf475e4afb8870fa5524846 +size 691896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b77a147745..f3b9010d19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fd5fb5ff2671e79322c469228f38594a7806be8b224717bf8d4f827835769b3 -size 634825 +oid sha256:07a8849b807b2f3eca8f2e391eefae67f82aeae1043ded4678ff08962eb6530f +size 609943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ab1a26915f..649606ede1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2d468c3ad18084e628ae085255fabde2385509c5dfa3f2c58128e27fd1b43e0 -size 712335 +oid sha256:05363cd6736605a473f70accbbc6cd2da098072509d55d716f149cb6853b098b +size 685432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index eec88b62a5..e734a273e0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a58e5b6f26c00b1944876b03021422ed69deb8d5afd66caa83912568600c3f95 -size 625205 +oid sha256:e1a78230f8325e2170b468c046b3ae9055205283bc6ec24b38ccff17294b3ca4 +size 602791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index cf8f39845d..9425b83e74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dca35aeb6d46c3f1f7aad8d2b14e4a3831c55070af3274162334fd42338b972d -size 811727 +oid sha256:b6df1dff63c5dd7fbed256278e9102d54419c9fff59c0a060f7e5f7d9f4d4690 +size 765782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 58a943efb4..131c92311a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baea9720c003add7939ce4e900ae043faa7fb8d8facd5352551251c86a208054 -size 719219 +oid sha256:01fc1d6c732e0b38361c70f4cc26b0742fd29a0365a476263cd42c1741deced7 +size 676282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 79022bfabc..a959dbb9e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4968ce6337a67353233b58bc4bc70c36db7254817c6c0d4cb9eab29493c27dfe -size 802009 +oid sha256:5196075850dcf1c9c13988c785bfd55e5936569d2efe61198cbcf0bcd6112ab1 +size 758628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8bbeb0d3e2..3c0588e924 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:530cba8ae53a9aa5d5c75e7b3fe15211c7df6cdf094207a77330665265b5ccf9 -size 709599 +oid sha256:a743a622728f8ac235b781c33b7f192adcfcb58a25efa4fb2f1eed4106e6b233 +size 669130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d03784829b..44f2ffaae7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d0b24995a622d48acda2ae1c204bdffd0db1ef1010cd1b85440a78399b4943a -size 703063 +oid sha256:a848177bbe23bd20a763499055ba558d49012ad47246b42d7b5623f22097c3bf +size 705810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b9dc32093a..df14c8fc9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:00e7de33db424f89abbcc98f79974c33121717c8156ad960e30b6ff3d33885b1 -size 618497 +oid sha256:e2ff2c22c5a2a9ac6134267eb240cccc51e2f87aeeef3d967dfbfe77cec5fdd0 +size 618334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a80e052f49..53319a3768 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4edc3c998023d396ce01296fcfac3b854b739e6cb0a6da83f8d1ada487af1fce -size 699213 +oid sha256:01f61d6df3f47254706cfbdc58518cc7ac572018a25afb8dfbfbe92acdba3c3f +size 703094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 793d3ebdd5..a23bd0e3fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1d3ed306ccb6136e0f7da44703e93c46ede2fc3c2e8faeb6388d74c7129baef -size 617310 +oid sha256:fb1cdac125b0251f06d15941a5bd334e9ce4048317b62a1b39bea0d7ee0876c0 +size 619220 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 925d59156f..2ce95c998b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2246c0ad795b40261cf06194d71a2deeb0bf0aeebf2fc12d511f7c7e2578210 -size 770287 +oid sha256:f3751bbc5fd4e72052e1b35517f7092d9671d16f071fc82accc71fa7379761ec +size 772046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e1eaf8d14b..d803a56cc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61d748fa8fe3f173ebc982d390f8dca82e2dbbb6d9b6a0444eabf8a9a3f88f3a -size 686807 +oid sha256:eb445c7459f2ec5f458da0ca039cb546a60d60eddeff050fbb9173bf9985c086 +size 685952 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3f474a0f43..a1e0b741ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b26392bda0b602065b3b1bc8207f02e367af4323a6f7ceb887307d8ef0387597 -size 812387 +oid sha256:7cefecf2a6d865a7024f77927ab4847258c736d13605cd1eb790f680776326b2 +size 806796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 710ed9bad0..9abaf9135a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8516a983fe31458d97eb5c5b34506ce79ab307a2d0f238841ae4bcde30529a6 -size 720965 +oid sha256:9c5b66d2536912a51eabce8ee0680c230ea08392967511140722f2b0fa90f3d5 +size 712414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2a4bead335..e4729c85ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42ef10230e4df1915376efeac8e7b826f12c599d539303a126dc1ce037b86bc2 -size 790829 +oid sha256:25c4117264dd9d9fd92a6c2dca76ea885626ecf90cc1bb45e07b525827d1868c +size 790468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e80d966467..ded3671217 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad714a5398a72b2ae304f013bee4156b31414bbfc2c464b4bd3538fe8253f988 -size 698025 +oid sha256:31b0c265f0329943e95f86bf7d2675273626a10e3eb248441295c1e38c064304 +size 695444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5431a18df1..9a6ae339a1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b55a7de045d84e9a6eb6c802858c54f6e25c1b4c82ae9e7692cfc5d010c5129c -size 804591 +oid sha256:3cb81d221aab6bb6639d61dc497d878b724f5366bf010dc0002dc53375a9f337 +size 802502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6956537489..94b0b1f861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f280cce78dcb181cf7b5920293b66066025e376f37dba24f22b0f95749c520d8 -size 720075 +oid sha256:ba6d4312e367a165503b53e8adcc4d344312cf0c895dc0a69e980697ba5158e1 +size 713892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bdfc9a51d9..caded019d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b335038fdbe0936489307bd76dd67fc7aba3c024ea090d180b1619ef71e01d3 -size 785105 +oid sha256:8e07848bd7d60f5ac3e1cbc0be27fc78e76bb07bb2ece3082a0ce9ca28d2916a +size 785038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 20d394a667..5fe508045b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01313bf0f994feb533b2d38fb0b53a799f70fcb10b5e3bbf6d95205f118aa032 -size 698221 +oid sha256:9ebe5263c908ee039a8607a8278ba59123de7e8bb1fc4da670be2ce6665389e2 +size 697118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eaa6dfff41..b46a951320 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6706117c8f932fd11acfaeab361575eff6d68efc24554da594985b2c436dc537 -size 882719 +oid sha256:a0893d2c7471cfb204f444dee55c98e8867e033927ffdb52afbe71a033f05c59 +size 876192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a188697ec9..dc21d77672 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acc90bb1189f21c0525302fd642a726922096aaac465bcea72b117ffa9932f31 -size 789619 +oid sha256:152b95bce74afb44a006b79a821940f37f675c78ad547625738abb8a442214fa +size 779934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7448774f45..6e9f4fb6f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c24d1cd8725c2a1379679bf943672f50fdc9a1cfd401356500d23022b20b267 -size 
861161 +oid sha256:8a8419ae6287e8c9820c0895defbca35203140d4dd9e65d0f20fc402edad9b8d +size 859912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c576565a7e..ff68e9252c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df31f5fda81c26c863c9159adcde1b75214c11dd125d442eaec23f6715e1b944 -size 766729 +oid sha256:addaac15fde4cf684e762c4f4bff82022db392aa9ec5bd4aa10fbc4fb7fd2340 +size 763160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6a84525dca..a6734ab561 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe09f968f8c9407e7c8c11880fdb256987f5e240b471e7adb7ebaa8d669aa8f2 -size 665569 +oid sha256:acb93537a77bf20ef35fca3c434cfb3b3f08ccc73fb804df435478eb4d1905bc +size 642218 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index bdd9546444..03fed5c072 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b26030c698b0189dea2f4e03ef83bde8e1d1624c7030f745255c3cd68b725a55 -size 562996 +oid sha256:255bcb35d61abb85bc53cc3b7418ab9d7cfcf633490b405b1ac35bf2bae5a8c5 +size 545023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 78257e9afd..2ac91c4c55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:caeccdae232a0476a051e74cb5efb1bd5237e66974b3724a0ff7b8ad5175e02c -size 662953 +oid sha256:c9a87641cb152e7a26165e60b08694ea089dfb14bbb5062e1f70e1fca21626c1 +size 640638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 238e7f65b1..3f00a59462 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d34b524bb079d4ff9226bfaf4db2855e20f7dd8089ba8a48ed6c389d8f0d705 -size 581642 +oid sha256:e0b8b958e634940f863efb875a7cf048a13c12efe2031c99460fbae8262099e9 +size 560659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d086824bf5..f399120ac6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec58c3c81ec8291c6ad9f9611132709d4b09190bcbcf99733bfe922fd4f14a6f -size 731313 +oid sha256:114ee0e391f696a5fe452027494d8eab562779ff11ec9ae1623ab1f8d597e480 +size 709394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 
2e1228ef90..a1cd5ab262 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e81932f7b2d053ca1d999c955444be83aa489e2aa4885928bbe3d2dbdfada04 -size 628841 +oid sha256:7b06c048933ebfb783ef2ef3d679bbafdf4aae202c6b3dd13d51f59e3ddcb75c +size 613333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8722932e33..44ef804373 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7f1e047134e575045b6ef10736fe5c317082e6877a247100159aeaca3f3ca13 -size 808737 +oid sha256:5d63cc6eb519522a42262d383730df97370f88a8d749940e5fea132fd3c0184e +size 759338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2b665ebb06..0d54921316 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39023d35d608463e0481f3d8ea3010945a0c7cd08f7c5df71a0ad22b5a52e48c -size 687517 +oid sha256:ae28c0b991b82065cbee1739d6d472ef124adbc251469eaf1da4da0bbbcb51c5 +size 653608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8086b4b0fa..94bca117e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fda78177b2e27237db16627a1a920391a2bcfa1e072205f8b0bf7d64895dbb68 -size 755211 +oid sha256:a6491a0f20d14c95602301375ec0afc031832f2b3afe4dfea1948b2c4c7703b9 +size 731564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 31be8eebf1..1ce2686b4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b7221a7050080ca611218301634b8de7125de5776b4f33846ecb89f161f4810a -size 649975 +oid sha256:b0f7dee011959615806d7c2796865855158f9d7f97d10e77c8063875fdfb96d2 +size 626624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 329a925acf..c117735e0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b984bc706f101e5877e20a712da48fe6dd08e78526898e315c11b6ae747a638b -size 811497 +oid sha256:1df05f3b0f5ef9c9403f8d6e4f46f7eec43af689dce0afa256b7f78134a94f94 +size 753466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f5b6ddefca..5b85dff940 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b94d87325129381d2b1d95e8c7c27c60d76d8d1fc07c3b2554fc64d3959b0c46 -size 716967 +oid sha256:7079a073f6b7cdc4bea6933a94cdb1ab04c480c871ac82a13d4a34c31ef594d2 +size 671466 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a2c0025688..272062574a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a21484be7adfb680e50132e75d22a98bd6d18fe689fbfd0dec389586fdb09a77 -size 756393 +oid sha256:e897032270ed5fe587c26c72b3035c8c7596343b637f4c6a447ae247c055c68b +size 725690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1ef1039c39..bcd50c0fea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31571581b4af84e151c1a4bc774e7fe5b0d1b1a495bf5383c3d24131d00ba4dc -size 672617 +oid sha256:8beaa85973fcdee6d20df0cff23044d6fbd6d509eda02cb6884a0e02f4401274 +size 643690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3e21f176cf..1cd5cc62a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e17a9f23ba2a5d43dd4afa9908e70564a52ff5cff9e3f0142f7bc50c985a0bbe -size 880845 +oid sha256:cc49825ce84591a684c0682212e2a7977099c34ebac0e69d99fba9f010a9cd23 +size 824342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 755dfb5425..9c70f33b07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48fbe4b399027792f213339b6167f396959cd070748456f9649b77f66fe1b6fd -size 755481 +oid sha256:88342e794bb6d305c92116eb8f39f9dc0c6808a2782761113e87d58b7b101dcc +size 721770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 662ca8550c..a05d8c60b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03f56519ac744927050c89dc8a08fbaf0fe327946a496c3adf0f39546350675d -size 827615 +oid sha256:3572ce129fb69350094ca802859f2f02aaa480cb66d6a368be7ea05cd3b8619b +size 796568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3b076906be..53b5d1045b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e672e31555a7995af23b5072588f9f5b0dabc969e64d480d4823b3fa3ac4d0f5 -size 717939 +oid sha256:fc1d609e4a079b35076622478e3ed1547591194d117c5013694670b4d7febe3d +size 694784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 783a7a813e..00de71f807 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e670350d8d0c3a769b6385b11c1f692b90ba9bb1d0251a9b4ccd84a2c4176b4 -size 676915 +oid sha256:41c75f28c8d0593e346a16e815c95595d2ca87c8428525b8deee16398874d921 +size 643598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3c471412e1..f0b8e2be52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:488071cd7d68b6a3c5ddb62cdb48dda69482656aa06952bfacfce87e01b42fed -size 586576 +oid sha256:8684c46fa2e7f35912ad176a0d1a1da623d9d1471c6361db7c31fd904ce54b4f +size 559871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5920f96d03..0e7a78a4da 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6b99b3b48152e2bab362e16b758474836c2d9f2c025f20dcb81282f146131a1 -size 671683 +oid sha256:8ef1266bbbf1a70d322eb678f329919bc1de86094cea2798dfe357982cb79c3c +size 641030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 8bdbb606cc..bdb0a629fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c1c33ce67ac1a89700de755b5cdd5bfea4f0236e61a733d4d592ff2c9c25c54 -size 584700 +oid sha256:541ed9540c30647f71441a397ca91c42cc5e6add044ca72d708de8d4febd4160 +size 559573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6a7429029a..f991497665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f5481becfd5f424ffe20b3d29f162a9dc46c776c552470132e7ce83bed1f764 -size 743299 +oid sha256:b5ca7f092869569eb7100c42569d027cd33d6d6a6119b9c10f40575d3e6e854d +size 709884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 372499790b..2be9263325 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:898a1819136ade3b7a06475a68618e34bb41b0e9f7a139d151943f785e2093a2 -size 654443 +oid sha256:1a757c3d97bd9b3c2a0497e507d8830663da97fddd10040204b61c188550f76c +size 627984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 15bf1216ba..a2ca4c0001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:1df5a1393dcd7cfb1bbd33daf52a8f84a89e8bc52f761fc5e1c3108789ea790d -size 768529 +oid sha256:e5f724afcdd7a7b0a18c2addaec29cd3f3b89011434333fd060e2e5c09a35203 +size 734274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c9691289ae..31a988c12d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c396c26c1a0bb73a31af4338da4205e61d2f2a96453a7cd15c0885f24a4d91bb -size 673701 +oid sha256:51d43efc43cf20053d083c52b9bb27b9c991c3aa823234c94ddc6832666dfc5e +size 639892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 32636c5a4f..e57a23cc88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:635aa7e17b86fd1590122e7915c6d4050324dfa220f4899116fd7d7a6b171ce0 -size 758809 +oid sha256:8efa7aff1cf5f047e437f0acc0173daaf847482217527db31d1e2829bf8dc0be +size 727122 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5fb9d63cd3..a82029d78f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3b76df27424e0c03d01df7e94a769bacd98eb13e16ffd565bec98fc7c819283 -size 664871 +oid sha256:b28a96c1254a276d948c3d35b464b04e94bb044667a681e9cf6530265ce755ad +size 632738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5cf4d32563..8b1e553b33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e09cfd17052d8f4ce52a2e9413d4ae1cc5800058056bc1e386fde1198ff2c42 -size 762507 +oid sha256:88f7655082baaa1abfcab23ab8a68bd12843e5c4c1b54416a658a1e29bc09e0c +size 723716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 476b93968c..ae7d0d2887 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a0b912f61dc6931e117ecb3152401304039af7812f88593a88eae5e4ed051d3 -size 671973 +oid sha256:523424607ecc413682d6b85b95135ab87414eec720cbe817871825420e89e67d +size 640680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a917eaa3bb..851f65de61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8693801145c30df29cce76253dce6d015c941a667052b4c1405cb4f9f97b4726 -size 752691 +oid sha256:d19e1e9b5750f63263ce9224ea2c0228fa23549661ee3e3b176a69d016904f10 +size 716660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9a5dfdf99b..751ef042c9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:821f069de6ed274c590b830baa2f060ffcda555f23b61a04092b1c100b56384e -size 663143 +oid sha256:f2c2024345831b3d46266d54fc0ecf00bb0c5c2e6f3cac1a843475bb2a132586 +size 633526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d285b0bdea..3f9cb3c6d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9931240eab11cb97c60149c2f137b81cac82cf7d7eb8b4f8dfc4a5d905c972f -size 838367 +oid sha256:0320e0119c41d87cb49a17a7e0a5e0daf33a2e173383b52c143ead13287a58ad +size 803226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index dd0cedf423..b6342148d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5844e29a8e51ae7b3959460a289fad7b3939f3a7dd6395d6372db47c1816d747 -size 741715 +oid sha256:ab95455e68ee202c48d8fe041127260635a74091ac7e05f853e927b0111c304c +size 709582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 390373eac8..6cacc8dcef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b2b3bb7745157cbcb664f398601d5fcec7d891cfc536fa95b61a856761181fe -size 829437 +oid sha256:14e9bf8952c75e1532c5c29d8daa45e59b3cec56805f9086f7bc3fa19500a3f2 +size 796072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d75f7bede0..da5f4c7a3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9146823bf8f2afe995985072dffe22d10e77ce1c7b70f9ac5c823e56379ba1af -size 732095 +oid sha256:00e17bc760c85c51084419290018019f7d0430c5a660695694bf995984200820 +size 702478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f0be43f83f..7bf203b4de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3a0ddff4f5198a60c6d2e885c07f274e207dcbadf309e830c529162815dc540 -size 830739 +oid sha256:86eaea9f106967ee868c56e520c2c1dd2a20fd1fa79fc8fff613696000729600 +size 785286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e0433cd73d..047400e3d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcdc93e66d1f6113a1aa5f934095d37b1d01b7e215c88d4e1a007c133df98f50 -size 809125 +oid sha256:0167f4a6bcc9ecf6aa8235e82cba03517470e18f524e806aee54dabcc1b962e0 +size 786020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 333e51eda5..3f70e0d533 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fa714fbb934cf896a30d67954233c9c849798de545ba75777b313f471aaa869 -size 797419 +oid sha256:89f62866ad5cee9b2a8f99481fdda76eae1529b381957cdfd8d6fab1b0296ca1 +size 762080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 61413cfe60..8d0abb9508 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5c5b03af73b3c21faa01e30f0a707217e7b0d7a6823880eba46b5b668bf1933 -size 690407 +oid sha256:f6b1a5c89eb63e13a293350bd43df5f56115571c05ab01a51f2f8cd6415b0c40 +size 672876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2e6a6a3aa3..c4a636ffbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2453f6dd198dbf26dd952203327ad4de6eb5bbc4f73639cd11c732e41d585927 -size 808391 +oid sha256:31d030f5d837f6eb72b7df9758131609d7be0adb028fa12a6b6ca8355581c85d +size 771028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index de1219f7f8..10cb34d82b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bf5bfc1c0b8e90213a56562fa1518c6ee404f710dddcbb7efac844132187044 -size 788749 +oid sha256:8aabead725354809df5742ad150c9c4681c3f27a32c52c43b2e40641ddb4cd15 +size 771714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7cb802fc5c..b7f6145e89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb5d36a230dc69a3ab0a6c76ffb9a42974d556666e129a4a11da4c332bdf4b5f -size 775367 +oid sha256:fc7bc47cbbbed7fee26c7cd3114d606df5ff62b2e597426c6e6c9a101e482c28 +size 741556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index aa10b38aaa..7e03e30244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ceeccffcdc504554bbaa17e47bbf8d0e628a56df29e552bd95baed66938cd6c -size 668059 +oid sha256:a16360d8a435e2a17a24df1f78eb590e4be0ef25c3ca71e64a27648578b111dc +size 658570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index be1a1b90ed..b5d03ffe13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04d7f4d31c1c7b937e4848bee26e9632ea501a184b2d91490cf010528e02c057 -size 720129 +oid sha256:cf1cc3d79cb2160c001bb2d80017477bc51bb1d7917a4728314f04451830558e +size 665748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4fe9faf552..ea095aa05a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02c94bb1404549f7050955238ef255df0a8d806a19cb73ec5b0b05cf34de14e7 -size 670793 +oid sha256:e87f01e2f8470a1d42541b7e0f6a4a0cb4fefba17550fa1646974c9ae7d45b43 +size 622086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ecc6a0ec68..13eafabcb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e3d01d18ce9824db3105c29c92d74e2651e71ca95cbdcb488043dbca6c9cae0 -size 707937 +oid sha256:5aaff2807e497017ec8c933591766006f862ab281d45b1e1d82f1ee95a09ff37 +size 653902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dc04f927d2..2f351b9956 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01a2b723e4f5b05571081bf44169609af6f86ce2bfeae993b7782ad7db8b4d1c -size 663437 +oid sha256:f3dcb81941b6b2f80a266bc75987f6d5d6acb113746f4e0ea24a5a4a033dd5a2 +size 616651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 62b144bac9..03be09e3c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9a59d45ed36bfad13d1c713a9a3d2682a34aff4bbc4879835603c7637720ae9 -size 712463 +oid sha256:28a67b665dcefc6295a8247de76d593be8938e09dca9402995d5ff386185d89e +size 659512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7e797dc85c..a1ee5d00bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd0bef70c583f00b093097f7e9e0bc772acbbe9bdcf3c6421c4bd9e413d57fe6 -size 609050 +oid sha256:dc9a7431410c1c789badf20cf7cb3d32976e32abd74559212c0c453ce971bf94 +size 566263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 425bd0da5d..33ae537183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1e8749f75a7fe91093e2b23cfd4ca1c6bea3112e19ded2de58ba214190a3c9c -size 648969 +oid sha256:79f8dbc74a72c9d1d153f79ce264464e4706211ce82b8972e770a797f2dbacf8 +size 612641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e0985d3113..341456fbf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aa92ef22353b08f9c40af6423a9eae8b34b54c6e141d109a2be26024537047a -size 572592 +oid sha256:06a5aa5967ed3c0df34f04878e7f7bd2ab3bcc6729613567ca0076c744d59b5e +size 532023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9180c386c4..40b74902a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ce56060c1acc86f80a15b1b88f801c26f0660969000de1c13dc5341ad269fb4 -size 696301 +oid sha256:1aba166b16167c4dd72a2e730ca86137ea665f05505e7c7f5f0850aceafb52c9 +size 651490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 471635c664..fd331654d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1232e1c836b70495f12a6e042e019259e1ea970bd99a8147d3f26a6f42550a43 -size 646965 +oid sha256:2949fb96262e829cdccd4f528400f27d6ac78819892a0fc16f5753a5c422c80f +size 608567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f07da102cb..c2ae61e122 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e6db23c25ab12fdafcf24e0b9fa4558a7b4ffcff84cc16db729b29e5cc11a3a -size 685985 +oid sha256:4a86777d519b749cd14d753d6142539566dbec888dc68aedceaadd2f724f4995 +size 639594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 316116de42..4358be7959 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c18cde0fefca0cae7f989a84725dd1a302a7d6b7112c318c6872b962f4679c0 -size 641533 
+oid sha256:2aebbc52cfbe9393a0a0497e0ab40e1ba824cadc469e27c86e5827db622a6766 +size 602345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 968149c06e..a998acefbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:925aec9a4f3900cfcd305c0be3ef7425e7c1937f2f51c34ecaa53e662834f820 -size 688535 +oid sha256:4505d08d8e25fb982a94ede17b932c048597982c34d20eb04b9e75ed50865ccd +size 643626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1509e15842..a8c2de669a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:066af1890d2f1c9051e3a4f394d93634131fddbd0ef0ff8fd7b1cb0acba40ffc -size 587738 +oid sha256:2d9432af54f73e024d0038d4a32b4330542492cbd72745c5ebbddd0beefcfc70 +size 551955 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dff14f3771..b01a617b9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccde9873509a916ff539272c519606885891008bc9e45183ea7ba3b976605394 -size 629531 +oid sha256:7f5c3b67024539c289d2af736c0b416b5208a23cf04efd1400f23ae6a71516aa +size 598237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 06d4546f1b..4f3ea19ed6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecfc83d1b936f7fffd6ec489e14c9d597e4fe4ca35e635de895a18617f7e2e38 -size 551378 +oid sha256:dde006a087798983bb00fb46a6c1326b83174753e953fe6b32014e15d2e54584 +size 517717 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f3e0532a22..b19895b7ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a87f0313e152783ecbf564ee8deaa068831c1ca3ed71b673aa355c112962bf11 -size 737725 +oid sha256:f1be9269160adfaff672cf08c1fd05ef841e290d7bd0a9aed3454aaa77bc3248 +size 685564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1df8406a8b..cc228ff330 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0172182d582bc23d86273e35cc6250f5b95835ee4f19ea440fbeb980ea573b1 -size 688391 +oid 
sha256:d91a134706cdfd75fa976052a934607f9f96ac272f43f361d46f60259ae27785 +size 628040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ecb6a7160c..915ab6488c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:578ca0933db3bd34cdf34ea682c5485d4d397ba99d0b068549fb16878a4459e9 -size 725535 +oid sha256:52a1d3fc504e1edd20bd96e7050ad6d500036f1dc9e7d213709379e45f9d62c0 +size 674458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1abb3e6562..c0e30f67d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:41cb43265d3fd0f93fa50f726d4196cfc9545bcff142c28731d99ccd470ff9b2 -size 680245 +oid sha256:2d251cb9e1ca6f7e3edca148bbfd0754000b4d262f8a2ed4d8ce59c8da790abc +size 635532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1dee745481..9fb25bc8b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3236ca0a366fb502b4a1f0e1b15d18735ed5366ebdc1334b12576e571b316f73 -size 731293 +oid sha256:8772bfdb560f5c676c55447b35325e7739f642aebea47b66e62297ef8d8efd68 +size 677306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 04548a5b2b..ecedb170f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:68fe61c6522692d991fd44ca4c7aced8e37090940a7c555700940778e6bb0628 -size 609824 +oid sha256:f499cb90aea721ef411a5eb947db8d4c30b8172b261f3a78adb4b810f06915da +size 571477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 64d3c7a07e..3b16ab8acb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f12491bb7366c88eecd5eb9f33f2b82a14433d56807a80fe198234b451190ba -size 689603 +oid sha256:b4cc60c64ff76240607f48bb59103e2539465d1252a17c9d1933fd3a017ed972 +size 638972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b4f547cef..e3e15bd0cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b9e6fa38564b2ff6de3244947430f236ef65653fd30f78e1ac657e7b50cd3593 -size 572034 +oid sha256:3a6aabe902bb2014636ec5046296ed5d41956be080e791d6673a2246ec64768c +size 536595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0220f525cf..02b76d4a0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cc959158d2a72aed3cfa879e983ac967c906be1045ce12210d5d2c568d1b9b7 -size 713897 +oid sha256:7396ca45bc13eee2fc2ee9c022844046a1da6f6c1bf28e7fe639b65c4b9dea20 +size 671258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6313457756..bc93064c78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13de13d1a59921a3a748cfdd29a0cb0a5a27594502d52451b04d3b432d61c882 -size 664563 +oid sha256:68450364e4ab5bd098a8b24dc95eba903c7be55fa6f7e5d9ee8af77a6ac08614 +size 614521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eef89ca94c..17854dc6b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fbc701754953618198d6ab34f63152b11a5d867ddb643a9e5265df0115fa89d -size 704371 +oid sha256:2177f4d8738076fbc18fe50e985479b2ac24205d4b544c9543e6c287d4194d1e +size 660200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d86f69b931..7e4f7e3f61 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49691c424ee4184a4b058e3d647dd381805b946146024d7f8c6c378ec702cbca -size 658341 +oid sha256:40c50812039022b256e6902b10b869042d25d52ee2a699ea43328e47b0fecc6d +size 621226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c1755bbc56..1ea063278c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:826f0433392b416a6d870f855a6144febfdc6958a0da2238ead0c47f84591572 -size 706477 +oid sha256:8bdb37404b3b8984730e16d133caede93fc96454fe254091e0b621a23f039b18 +size 662210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 67ef5a5ddb..0f3a50e7d0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1eebbe8dcf3f34a6c04481a94ad2eb99d2260d231f6a920d56679dba2e1cba1a -size 590486 +oid sha256:b5fb2f05c4b642b436734bd80ee2f9d801b25c5519484cb1866f82a1a45b444b +size 557169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9a24b5217e..55a8a81f3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f62822efb3c52b61fd801409dab10ea5989b5c5f7211e8d52546de34cf2a2293 -size 664789 +oid sha256:1a4640aa7763b3a4b2f07903670df8f963868dffb4db75283d798f9d45bcb229 +size 623776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ceffeb5074..31e456876b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0934f50559a9e80186b166b89571413a6874e02564513a436c9cd5c4a031c04c -size 552646 +oid sha256:9f50cd23aba191ab38ff56d57541dc4c757a31109c490053531fda06c47a1079 +size 522289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 193fd0a4e0..de87d7d6d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfecc285df23382946e5ac1dfe62468ecc774d6bb86e0adf43b9576e5ede1e19 -size 888257 +oid sha256:77ec2ad76cc635e7a2ed747e77aebaf0b90f82a8125ed84946c0e77a2706da0e +size 804720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba452ac4ef..3cadb00d00 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dfe04cd031f67a7f8ca274ff22c3b0438bf08f8816b82534fa055150b8dece5 -size 844793 +oid sha256:c584df4859386976cbf41265bf3bfcb3b31f226d253b5efde665a86a073c6d1b +size 765102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2402a48140..fbb3e34b67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6362f24b36c33b718dd12934bf23de4cfc376bed4bab8898df963eaaa1a7f26a -size 872217 +oid sha256:2d70cce071754f546c4d9e4fbad35b62b4092719d0c7a3649f7b88bfb4da0625 +size 787150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b6b1c51b44..6e73cf05ef 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4916ef2c6253e0c93c4e52782811e8a908ccf7d77077d2f329457a9a7a40b834 -size 834871 +oid sha256:9c061b736cd4762cfca19ed1da0ed647cd28e2b9321d7c156f58fb166a20de77 +size 754786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 978d36e08a..5d392f140a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6f4761a96182e749b9f35e07d3073eb536a1a799de05656cbebfa2c21420bf4 -size 862781 +oid sha256:ec714a6d4461ca47a60f273e7c89afd544e3a49fdee901e09798d15f32d1dcd0 +size 775148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5c094299a7..c43dc88bff 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e002655b3e8bea12868aff9a1e696d36e6e95cbefb9c87acee03b93714cff43 -size 775355 +oid sha256:22074ad00de04bdb9ff19b699514a69f6031df63f62f3c08b80dbcb48c379c78 +size 700844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index af3598132e..b249bf6676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bc2a1441b8d9a5828564258a9f1a27d179e70a6cf2de9bd94ca8eb00c20daf8 -size 794501 +oid sha256:d5dad718647c4259168940b9510ea1f618a82dad908098f9698947f8263f21cb +size 724038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0fe925e20d..a7e89e7f20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e67d3bd9c5c2afd295de4ee39d3d69918dbb2a79b3259e909898e452811b1d2b -size 736577 +oid sha256:cc7817336d57ae7023560628f2cc4f8d04a82b9c98241b19af71e270a9cee303 +size 664632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64ad58024f..750e15883c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da682dc55e66c639e7455d622059b04e5d8ea7b9247a30903b034d1c0df85016 -size 841983 +oid sha256:0a315d61beaa73e5f6ea4ecf18b654024bfdcf6d8b8aaf8d98895e1d31db3c17 +size 776944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index baed1c5839..689afd67e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1cd0bfdd83455ebf4c4ecc21083d8430be1c2b94a501fdd3fd9c564e8394dc6 -size 798517 +oid sha256:b8f674aa284d8005d7cb903ea4c58837fe6f173feaa7c5b3b56b73308e178b44 +size 738068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 28cab10afa..b929f3e84a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b964cbdc534d90aeaa5f5f4bff7085bc7af63f00fd4e7871af1c6d25361d6cf9 -size 827521 +oid sha256:e9d104bd23c658c7d487971a9b87cb19b1a7808abba84bed3e4d2f12e5273f1c +size 759326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d61cc72f38..a5edf33454 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:574058617ad9f9537f0da060f23bc683b14db8bfaa3b86806c49f295b5a9a0f9 -size 789385 +oid sha256:3ac6ba21849e4bb26bd583972b8c84e055ff528b0f7d434e1305159db0b2c104 +size 727012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4dca604dbe..db4d1ef514 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d759baa351e49d87e500d83023192cbebe3fa66e008c6f4dd4cfff18c2d787a -size 815717 +oid sha256:d6c660e9b9616b0642751c29628987964daba4fcd0aae1f56fafc485d1a1bf32 +size 747374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c9051e3a7f..f112240dc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dd7fb0c14ca57bda2c36a066befd31b9145c03b8b199d824378137d1ace48db -size 733421 +oid sha256:5db7a44327ccb25ef7ff72879749a54b72388b9b009c3d7ba9980f2361c2ed19 +size 673860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e3144bb05b..43298f5ea5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:684f73257de18e6ff86cb244de033627c976eae1619d8a73d1e1d62673df5390 -size 750891 +oid sha256:ed017942e0b1e8b2b780d604ae86b3b9893d3cdadfc98c3bf5671c5bb9b88e1b +size 696362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a8abfe6014..47b0958bd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9bf3804016d30cfe6f24f1e23a930da381c2528461c6fddda34eb7d30289811 -size 694643 +oid sha256:96623ffe805f1cc37550bbd34fe56bba82420b1368d23a3d9758d0d8505d8f9f +size 637646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e2e162ab2..85eda535db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70671971f8243c519bbe07cc8c211689019636fff270c2ec775bfcbf40db27c6 -size 908665 +oid sha256:de07d5e23f1ad97ac4286c79b03341e994f46c22e93b9ae44b7e048f96af4bf9 +size 824388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0d79dd8db4..efe15731ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0a72b7f50721e08936db525d812a300e412e098d580e02d1900189330e9feff -size 865151 +oid sha256:a83138cb12ee1f352a114a24505e038158abc9927c349f04181131937a6ebb8a +size 783834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6dddee56e0..1df5557a34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf22dbcd0df56f27a2e78beefcac885afd3fbc43029e22e901c1f0e84f1d5536 -size 890899 +oid sha256:383fe29be435a872b5e18602d0fbf9410ff683ade3d2bf0ad556d4119b42ed6a +size 807608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2db108924f..fc9670d7cc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:882f2a51f0a4aea60a8a2c2ec34f610bb8c45fc1c552b13b94a5b12d13cc554c -size 852763 +oid sha256:2dd6bd04bda1002fa52326ac99752a0198a8a32aca96c3c5adf36ae7b280cec4 +size 773566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ef253ee1fc..00be5a11be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d76d04686f90f229ade312e48dcab1b80e12f0556490bff6a6ccefa280a08a7 -size 879095 +oid sha256:f4487080eead1d48d8559c9156f581d2be0b4dc524599be5cce2aa87ecd920ac +size 793534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3c567b597e..8078f75aff 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f7c11a2e0efbb8211bda10b5dbbc777831d0706b519c27ddc338d3cecc6efe4 -size 752547 +oid sha256:dc2bcc037a30d0e1dbe7b370c9bf0ed9af01c3eca5b02da72e77ec34bcb642a2 +size 684698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8ac9ba142b..9361ecbdf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:864fa8b8cec0f6ad6569a50288d9e7704c069b781bc00dcb813c5ea623048ba7 -size 830451 +oid sha256:09c6f874284f688f5f010a43a30789c61c014db70de6eb080f952dde90f5046a +size 751994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e6eb1f726e..bc66f45b9a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:892e186d86a00c3132c2c5eaf6d2a1e780aaeb2c8c2f5a630c788ce49b172fc5 -size 712289 +oid sha256:0f1c11bc04047deab3f7cf752617875eba5eb81d8c98956ef7a551eb85fb7141 +size 647844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a2427725f..c6c419ac4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb0b2916e1ccd7996512dee263579624d7aeaafaa16e518b81a188f552348b71 -size 861157 +oid sha256:f3942c104fecc269010e88896301ed77da417e6bdc22182d02e048868dd3406f +size 796614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d20c0337dc..f5c6a13a92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd3335ce98c6fd4a6ea6fc65b263f114e6ca46bcd1d88ad6146a32230cd059d2 -size 816903 +oid sha256:dac9bb81df5d2c146f304e156fb24c09fbd4b9c5977231f7a621541464f4759f +size 756060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6f4b798e0f..429df2c921 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28eb0cc6717ebe3d639746d57ba3e60222bcbf9104051338dd6bd3d1839771d1 -size 845957 +oid sha256:c7bf0bf4264280003f96dc4b6ebe4b10e6cd7cbcbb41cae3c5809a4f38f79fc1 +size 780574 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0454548f6c..167feb556e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49ac014992d2b2fc4cc39a99a62480d5306102c5969240efa8c2f5fa156bbb51 -size 806981 +oid sha256:33d7fef615e98bdc89d97a3496fe7d846163411518a23b85580103790bcbe45e +size 746532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index efd2747d92..9659a4a7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a57671736ab7ecea0ad2b703d323cbb1667ac5b2158e318d684339d503c274a9 -size 833955 +oid sha256:39b001720dc1e6e0c40f2891c36b31d1bdd4edbf9415d655f8f12f14d1c79dd6 
+size 765514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 65c36a3fa4..1103111caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d40e3a54bcaa9a27c2a37807cec45117386bef64ba001f48f7d09ff6dab7f682 -size 709133 +oid sha256:fc522257c1b57378d7e292029eefbc6e52cbf8c8283d92ca3f21cf9f89d3c949 +size 656922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 50bf235290..0f714fc4f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:095b0a13c4063e30415994eeecf194d747b8de768c44c0ff515096adf7cb5602 -size 786099 +oid sha256:a352eb8129693f676506450aedb2f7a427eb960e1f8ff497efe5aae103cf3078 +size 724960 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 79f4482040..52affd0ebd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15c75af0f7ebecf327c41eaa9032199ef032b0308fa531d3895028f542b480cd -size 668925 +oid sha256:d7c5d3a113124da83bdf5cf601fc360daf9df93acd03978f80936616bf8c41e2 +size 620858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 53eafbee87..2148e98328 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9ba40e4e98ed576fa33ea9e37e625ecd7e87f8d93bfeb99150e783558dc8369 -size 752889 +oid sha256:0c862201f0ab0ac5d5ec4ae5cf159bff5be08cf8c1d9532c529d8879bcb4f446 +size 705216 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 74263f6a65..335e7bd67f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a97a189a2d6dd9f62171554abfa527656bf469967bd26154c705c5f73c1e7bfe -size 737047 +oid sha256:3dcfe1607ba2bf4ab8db5942f9e6cf5a26da6684546dbf7b6ce9df1298f3dc17 +size 691644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4be89086aa..03d4970046 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ff232ca2b6443aeea7fe966b69f3b45e1273817f736d2578d4805f060b5679c -size 718681 +oid sha256:d71a9e678746e271161ec23d76213372720ae97788e971dd094173b1a184f696 +size 679542 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 548189048f..8958c0d76f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25010322b5cae00530c23ffd2e26a5a2a47ce6a1581a4af27468490ccd73a662 -size 630267 +oid sha256:8faba6704c0fea69ce019a0ce8b9a901edf43f03709777aa67a70008b5689b98 +size 593645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 831cdf93d8..14899f00db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3b15eeb4b95487408daccd1f4ddcbf17a14dc6793fc16b0937926dc174005cb -size 743219 +oid sha256:f6ae3bde7411d71fcc518cbf08c8444524b4e24c77c87899706809f671c2bfca +size 698112 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cda1d896ad..4d5d799810 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4510b8f460d2bc913aa1899c70a7187226b46a9433acb7488e5c73550529479 -size 728117 +oid sha256:1ccd09675309916b8a458b2105f39f50fa877973c38b3a116d7594b3c23974f4 +size 684490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6dc506edf0..0fd738b38b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13f4c8539bc06d0e55aef8edd70ede0e5c8c62d1ffaf54cda3a504250a1e206b -size 708913 +oid sha256:91f7d45494233a28d2686fafd962acd995384b22c05ed99e73c5c1bd89b04a68 +size 672390 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7af9744b31..274d3638bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f8ed68eb4c0f25a9134c9ae2289d4512febd7ebefced8504b4ea18914240e4c -size 620597 +oid sha256:59e1c28a86fd6f9de4f3a2cdef5e59f93f01ac8eb2c079cdd5ce83f0f2f3fdac +size 586491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 19ae4e3483..cafc399109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:477207b553d9dce4aa53255a622e1a3cb222bfaacf488284e2b9c42fea0a54c4 -size 658855 +oid sha256:119d2a11b549fa065563ca6866f6b2bc4d9b4c46019f29860072a069a568965a +size 621246 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e424568a2..b9dc03bf72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78a0c4a076bf454912412aebad04509c192cc5d455f363d288a866f3c7c058ec -size 632015 +oid sha256:b70370f95c82924d12bb23ef0db565de47acaa6402b1c03772d1d635d2a647c8 +size 597119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 30c19a8e09..e29a6e4816 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7fd8d70e2f39c97373aceef047157a21552bd6fd798820c1cea329a29cfacb6 -size 651795 +oid sha256:8242b1b34cab01d563732d34ff2615393475ac95eaef37306c69303e8a1c3be9 +size 612211 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11cc493831..d110b47930 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfe21fb8ab480aaccacd6ddf01fbe9783d067dc7c752033722546cfa0b5b0abe -size 625745 +oid sha256:53908688b9fa1f60ddea3950894b4c15c8cd3656b806f11b13505b56832e5c86 +size 590799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f2572bc661..3f9cb08948 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63b49f34a06fcb15d0b955cfe3d0d873b008bc9ee4c613ec8457c368b59302c8 -size 644973 +oid sha256:a7ff545a0ab42be58d991c9bc7be08df2cd4255d9d81a6a1d0a958c982c70197 +size 615601 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86d13285b2..fe70e46946 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f01608784b1e8ea6130dcb3adf0ad739a88d6c89f9a4f24bf66c42744841455 -size 564896 +oid sha256:fa57a8fd0c4263c6587cc6ff4b11bfea0e309f84cdb45c995c67a02e63dc0b41 +size 532813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 28829e9083..6d71b16bab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe913dad89d5dd2b8bdc8e1bd6c00a3f1138af5c1587c233781035d4ce73d6ac -size 616110 +oid sha256:1dec28fdba97ade4947e41e5c036acfa5859569b7e731a58b836825e416fe82d +size 589699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a1f9467310..fd90b965c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4de0db76c96a7f10bf68db18ffbfb292a6c9b669b4eb714cce53641ffcd289c7 -size 535688 +oid sha256:e4df513cfd636d79c09883d552eee966d234f62b71becd766891c804c2934028 +size 507157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 487bf77f29..9f1353a8ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b04b3409eb7f252cb6d6ed9590eab297ca1b13b03e991faa0887df066c1c80ce -size 649975 +oid sha256:8812ee9351d2358143b86450ec389e7b27045af1595fc871b840a5834828a904 +size 614093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9c54e7c75a..54c5a1f4c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75158927e5927457b347341ad88bc31adfd4773e8c3582b6efeaf399a51e6336 -size 622347 +oid sha256:39887260fff57a1b1d9facf242aa305b248a8fa3dd3d90eeb1e45348467b3219 +size 589967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 47d63a3c47..2a1c2fef26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:814685849d9a48d5f1a4a589d36cc75fcf7adf2b4b79bb7ae865b85fe4d4b38f -size 642915 +oid sha256:01f5273e48cf97d6d60299eb9d3b09915ebf03a9397e565d265f7408f8eae8c6 +size 605059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7d9fc7bfa9..48754662ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f29202d2a2b95b9473a846f1a609c78893d14855a154fc72c84875ec34117a -size 616074 +oid sha256:a4a6ad8c342adda1af104ea6ba82622ab899cc1d84e665506ec1059327164314 +size 583645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e0c4e28a8e..aff319d4db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:433bdc4e2539804091270d09d2b533807a888e08ee9ae477e8a073626fb0c20f -size 635105 +oid sha256:eed82218f92e4cc957da97a2d44d89428b0b5d5e1b849fad1ef67ea44c5b6760 +size 608449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1a9c9060e..e32c298405 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f87fc2f78efd8e59acee8255a9dd3aed6fb069d66224342da4fc4bb392ac40d1 -size 555176 +oid sha256:c0a79df17d910008aaf808f982cbaaf0c106dbaae6bc3bf4e76f35afe11515f7 +size 525659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 206ef47f6f..4308faf83a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59470cd6e403414e4c000bfb16a36973c2b9f744af6def5033b9626498d9a37e -size 605700 +oid sha256:b9ff214ca4eb79cdf6fe2a5fa5640e07fac6b6178450d9001d6f59df5380cc9b +size 581905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
337152c0c7..6ff03b0d21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65ad3a7008935c30f2cc1826b60f021c1a75729e1d1be988514664ccbd073fcc -size 526018 +oid sha256:b887935be6f7061d1982de9a0fc8f21952352d928ed41b7b323568869e6fd08f +size 500003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0d8b7f6519..ab3e91eb15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1177d1ef90371ad7d4c7105c3f91393b37f8bc51ca3aadf6cb556321765593a6 -size 677241 +oid sha256:3d7c2ae585848fd78a27135315c1709b536f2b15681b1ae843e2b5717c069052 +size 641852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
index 32d6f0edff..f7f7bd4754 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d7addfe015a921b7b135a19b7958b90f3fe7b9ac058f8a568cf84f5767afd71 -size 649613 +oid sha256:33e889ce86837c343abf9bf311d51bce858de3194c43b7e8f0ea74ef0bc100d1 +size 603961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64bacec598..97bf4fc3b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd5ef156281b5b4712f73da8a423e22d400d9a8c13c27ef99ae110ef986650f9 -size 669391 +oid sha256:d2ddb87d4dde40c2b1bbc220379e01b06f1528a77cf92881602af554516584ed +size 632770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eb5087eafe..9a125e9472 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9a92601faaa4bf7d916eeeaf54a6bfe5318ab08916924a4c6b5256b0fa499c4 -size 642551 +oid sha256:9283c1290b08693b4ae25360d80eba4f5a3fb770674fedee0424860f22f4024e +size 608987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0f5657d6a5..404f69fc5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20b0cfb01466f634ab9db008a450aa1fc71744254e5bca6243eb5bc46a1becba -size 682499 +oid sha256:f3bfdf2dc608e2e1ab5a8aabdc30a848132737caa91bc5e61bf7329d4c59c8ef +size 642030 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8699c0345d..bfd0218205 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2ff473066015e89aa758e3b35cc3dc5c78f77c63390dca572e284ae918a836f -size 575240 +oid sha256:9d60e2bc8d207839ef83da149a4fd494a8987025b178b95c3d64026fbb1473eb +size 545131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 214c486879..ffc00b3aa0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:261eac13a8c438674d6a101c8d6c748444589d959595dbe149f930f67628621c -size 646977 +oid sha256:311880e83204556252dc04d0a4db6b5166e87f2bb93b495f106257dd15c39405 +size 611637 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5f90d025b..3c7a850761 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f1769c2ef6c8fca88e8b82436f15ec19d5becb869c3e39ed53bd80448c4142a -size 545392 +oid sha256:69d6593f2f62856d597cc1178627a8c1dd5eb723f4accc28d66825c20edcb827 +size 518833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a66b7998b..e18264f640 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a50d993c018cbbef5a8ac7c56b47d62d0c62fbba5f607d15f82e3db71a4e194 -size 667571 +oid sha256:444c5ea5b98e32e186bf98a42bfc1f2fc2786451d8af6d572d0d6e1f1a7c85c4 +size 634700 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 960653bf88..ce94cd8e77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bb900edb64372ab8e23aa89d75ed446fbd2a0781f8c81959e2c56fb98f67890 -size 639943 +oid sha256:ef84fba5a6e5d0aaf3cbdb17cdc0c4b07a8520fbeaa95e6994db1eca14416ef8 +size 596809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c76a43a4e5..596a9bc1b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dffc3d872f1526d6abe2baba93daf1bad19a69dc50d80d816fd158ff3e24b5e7 -size 660511 +oid 
sha256:a58dbf90dab741deb888436be6b05ea746a82275b9c13594697f3644f2174103 +size 625616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63e1f52a37..f8627a9d75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62be89332c41ad57c29a07b14f4af747c61512e1547665b0ee0a189ebf98baef -size 632883 +oid sha256:c0e12c71db0592efb38a2af5854d32ff622756dbbe4b73e8c8d45583f92e13c7 +size 601835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8051018692..e4053c23a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e923fd43d9e1cd7331a6be0047fac9279fd9788f16a6017774391943ebb8f475 -size 
672781 +oid sha256:53be88f895550a9b226af56c66dfd186eed4cfac54f534214ce5dfb0fb76e237 +size 634876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b547e269f..e460df22cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba87cce796d0dac9938f5732c5e62af73faee609f6f2b8814182bcaa6bcc621a -size 566360 +oid sha256:ef08fa089b4b727802520e4e7515852dd6da7dd254a77e7c71806e03fc5fa738 +size 537977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1f29ace311..4ad8938551 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97dd54633922ce67482169a1cc721a96cd3e6e6c1cf6811445741849feb4a0a4 -size 637259 +oid 
sha256:9263d847633ffcb78241361c4ea43a502efb4961c52b08d2b7da13a82dd64260 +size 604485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c0d7121696..93c31eaa3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:349deff3477efc8aaf3f5e61af1460e17df422d9d9e9ac64a3eed9666644c43d -size 536512 +oid sha256:69f160ae8aa7160d9137cc84b03514d015ac055881f5659fc4b581d3225828b3 +size 511681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c0c1823d6..0471a36cf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5db47bcbabd446b565fda7cddd02b4b7037dd1e3edad4518bd545c213edb5d7b 
-size 1042855 +oid sha256:5a28f5c20ce26427adfe6f8cda6d6f024310df49d0c132e81b222c6ce3dc8a65 +size 924488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 60c9bb0788..5127041e9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcbdd5ef8e5f71245226ad567f8516fdf8aa301ba9c53a214b0232d55de0ede2 -size 935553 +oid sha256:be25211a80539f82e2aeab01991c66fe23e0fcd5d83300b57413510a24e4fe6b +size 861240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b977e14ac0..5fc77f4146 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4faf78c7b63ed1645358bfc1b7b48aabadabc826fe96b8199c7316c0050cb634 -size 870483 +oid sha256:a632aa5340a8c245bf61529f53aeddd0ba5bf7095569c80ba013a67d95597fee +size 815116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4dd3c2ea16..326519b760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dfdb6ab2ee98434963380fc8dd20825aeaa8fefd16d14f6b31c2daf4ecb4b3a -size 991151 +oid sha256:96c2d2aa1946aa86575ab272116038e1384095fd75064373967536e31fdaa541 +size 877964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 50968c9e2a..264ecd1b80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b2ba37633ea798ddcde9d0e472af1396f85466958f9fdd247e75c24c352cd70 -size 888141 +oid sha256:b61a570843ab5bda8c88fb1eb345c1f16ed182435f3ba8fa47a25edee8e2fbfb +size 818960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4ecc405cb7..d1e588efa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f162fe155ea9153dc5eab26304930a592d813716dd65ba50255c6ea8e08c041c -size 767187 +oid sha256:e28dd9645ea75a939ddf2ea9727fe85dea0bb7441a44416323a6feadd901d0cf +size 708612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ad174390a0..963940c88a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3de654fa08b5994a20097bd020bbe45698c868b331be5ca119fb9dd3b4c6d18 -size 1032685 +oid sha256:6cfabba9f7307874353b6ed01d795fd145dbe9ccbd43c7cbac66180cb498a5cd +size 912838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d3dab98537..f602ef2af3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7736fcddb2a8841b47c612b56c857ee9dd264cbb672e59c0f34a770353f5e1c4 -size 925581 +oid sha256:f062c671bcf80470e2b1904d395f8f29b2e4f3136c65f8033b12c6d4d3e53cf5 +size 850332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1dce521d48..1b3c752d9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f8d0720fef988bd45ee8799992ad39cef0d7e74ec63b629981782d4a4544500 -size 985767 +oid sha256:fa74a7e0d7181153d75aea4a7f9cc1f85feac6d8385b65399d84eaa76f3159fe +size 872828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3cee3696bb..047169aa8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96859d5f935bf32a3d94471ceee0c4bdeb686abc158001b2563aaf9a2c66b76a -size 882757 +oid sha256:d8d6ab0370f2bc87d28ac7783076d4943a0851f074d517cb6e0f7a536060cbb8 +size 813822 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 77ef04f8dc..f78e0a3f1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dddc4c197c4aebb7e71e96392963fc3a98486e5dd5cdbdcb0632cf191917498a -size 1032179 +oid sha256:9c3722df604c8c3595ce02d238225b23d37378374be02395afa85c25c2f21841 +size 919928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a877428523..db0679ec29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:000a058e44e1763710395560d093c20b02853ce92b831c339f23055e8d15a4d4 -size 948353 +oid sha256:40612c38d7591e27b83dff7983fd675bf64a251c78a62eb3e2e26f570299b41e +size 830282 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 5909e0fbcc..a899bdb2a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a03f20ea92854f7473b36727ceb794936c5ecdb4cc027f21a1ce46081feab721 -size 938147 +oid sha256:3bddd9a190ac98958bda388f94c88eb0256ca4f729e5cbf9e6bf2ace6c3e1cf2 +size 860480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 08ecdf4d25..f297f7cb0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:564873167b2c19f05e21cdd1fe28503664445f6a65a6ed8c93b774e13658ce36 -size 838337 +oid sha256:51709499dd5d68bbf2e7abcd235a776e62cab96c03741542e86b608de3903069 +size 767578 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 504bc6e460..09b93d8ed0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f101743a8ebec62b56d207844b425baa33a7572db719165dbb4640c7156765a -size 810869 +oid sha256:af88d39efd98bd1e0de3f2d8522968149f7f6963969e92358a2e99b4f3da8c1e +size 727232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 691062d731..aa6725ec2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fc6f99f9fe4ede5bc30d4d292abb7cc4e3d8842279412e6fc069e42b1ff39f6 -size 729955 +oid sha256:5f07b66c54e7f29d56bb628d9930aeb561edd06f38cd6412be31e8b713c04992 +size 672564 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ddbfa8533d..21b74fa8fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:374b263e00162d5900b1046ffcbb9ca47a68a0c8d012db058f96f2b139f6141b -size 989453 +oid sha256:0ceb8bfd31fb344e72a0480a5bd77144a45a93861290f573fca904a6f49bf2f2 +size 882334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f95e85bc7d..53a76a394c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc36a9899dac3fa4a32a77a01eb7fbde670b5d175f7970aebd1eb7e00163b5bd -size 906221 +oid sha256:bef0e9f9b6b86c17d55dc443717c588eec9c39aab00ecd305c7d89699a837922 +size 791898 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index b94ed034e3..5df1448a29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a14f861599ead570bbe2794e97109168e3bbcdfa258d15d8749f3643c9daa4a3 -size 900701 +oid sha256:aa9257822d177106a9fbe619f31196052f80028259a89a551d985176eff49bc4 +size 827178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ef8407f71c..10078ef969 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5cf9568ecbdd6fb51c2ea9b4e5da88b90346530ba61df71be2eb58f2eeb4a97 -size 801435 +oid sha256:c6164a16337e1c9a4514b8b46d764499aa24e07349c5e340a72d9be7254c9855 +size 733684 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 45dbf8df3f..5d305f7e30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcf13406ad96bdee7e61a29cfc13980b00402f020fa3705f1d7fb1f010ef0c98 -size 967079 +oid sha256:9f3c3a97c6fd3bab96aa9e2cbed5ef7b8d0b1ecc09e1e24e087dd4ceefe18ac7 +size 878656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index de5fd89198..1ec6e9f204 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:982155bdacf104a28e1f338192194a4e063f5e493e8b238732ac15cd3a1621a2 -size 903485 +oid 
sha256:83e6d3d0b2717206dd6c737d2b962e6149c7c0e233e22dac9b6129aa500c42d5 +size 838990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8461970a6b..46c64a03bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0530357b88aee629deb56a8a62c4c7f1f7f523080906a5b9710647ed4cdd2630 -size 795349 +oid sha256:f34f9f2f50071c83e36e6cbca2e44b6c3ba3458a93ffbf6d7862926306594782 +size 757296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7cdfcb607e..1e95a3aba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:fe3f076203432fc19f2a1ed26279e41ffaba9b0ef3b432cc1b02eb0bde024cc5 -size 917003 +oid sha256:4a8197cc40eeb335e09dc1ad7ed1fbb5b9b88e7b7ab3f6d7eff7880eeb477c77 +size 832084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index d7dfa0a0c5..e2ddb84a75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:554a3d8fb74f41bd05d557973c8d35ec119e8a264fd12b61b256bb4ba84a2f77 -size 856863 +oid sha256:1bcf2667ebed67f679387f4e6e043bc4e136dfd59a7b4c5ec40925a778f3babd +size 796760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 96c7340f23..315532f490 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a13d66f13609902c4934d69dc6674b0a749b9a452d2e1e34712fa1e389d9a7a5 -size 692053 +oid sha256:4f2bd764e5999a31d64ab2812a00aa3ef29ab5464dae5c671d54a144a1c16381 +size 650892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d2c6fcd90b..c03a6a107a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5dd2b937b9463c989cc141d6d8ebc40b6b9b307ab23eb2c1173561b66039bdc -size 957699 +oid sha256:7306015f8a4bc4636a642d1ad615a8b4a00c32b06e370ef204648fd2f056e8f7 +size 867748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 89cb55310b..578cc54a18 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e83d3c009b01fa2de03cf53f4f48ba3f6902283a6ef50dfc824f291d20573809 -size 894255 +oid sha256:a15c3949a4d67b9fa2d25cc06f4cec6946c9d97a0feec2b1276226158fac9136 +size 828920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 921d09964f..50417cdcfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2929ee098bb3a4ffcff789e463f8a72f9905ff7345984a1d0469fb1ae2efef14 -size 910831 +oid sha256:c94aa9ffa311e45a7ca11f2002b8d3d821bf3987a6ea89928303de0e1e8e4172 +size 826948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 9f88f0202b..730dfa57dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faeba33f0285a5a5cd79ec83b576f598f3279d1082780dc71f8c3f5e1c79b8f5 -size 850691 +oid sha256:dba12dc1e880b65c58b7f592c88b83478af5094dbfd1d27b89057b1ffe82a635 +size 792412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index dcaa6dacd9..94d198c12a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5243433c1348462b4a1dab906c0db5462e3ad3c4568ccd102c4a9a921b145b21 -size 954775 +oid sha256:2bcc092f7ddcabc86cd39edd3a8f424646d8f59a851d3282465edbd076a54e6a +size 873062 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 342c87b37a..f9e773ac34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:474f90db518355471fc4ebe0c408cfba9376f417180db93bb9801341a4bbb316 -size 872627 +oid sha256:0e41b505386def5845448631a01b5126f911910e7ef21d5bed55a8b5c8638f57 +size 785192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ea339d2fa2..a5c531ed42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b34ac036e8004df2586df9d897c52854fd745428010dabcbfa3a5e11a4874aef -size 906425 +oid sha256:020724b7385532f3de6a9d310843e688b63a6a8274c042006cc1edc8f521bee9 +size 839958 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 24c0fe4f32..25cf35143a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de35b0b4bb1375ee7d76fd7191baeb60ca036cee46c4e702beec3c6953b275ae -size 807061 +oid sha256:c30c34223eb2c7497c8ea0ecc45d6bd82bab9468017881999ceaf795d86a452c +size 746166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 2756cd8a2d..7d426c0d13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f39a8a01259acfa656516c689174e6000be9e7de80cbe49de93f91ebcc14088a -size 725177 +oid sha256:17a5a00dfdb7811f8e5594814f75a3b7693e7dcaebff1cf302f4361534a33bbe +size 678886 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 701fed3d9a..e8882885f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca7e6f972c5f8b517e238c2b081c0978aaa6e60518e3c1d16b9444538d9d4609 -size 654819 +oid sha256:42ced49a07ca5a3f17b708fac10b7029b5a13808e64ecf6d62888134253335be +size 615681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c3313dc0af..1217e127e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e21f112834efcb13c0a35f2c68a10acf3509f2976c268e047012749d909e0017 -size 913777 +oid sha256:9ac00bced9f11567e8879e66a55d7882d567338361e019f870ee9a01fada0def +size 836258 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 92016c6bca..7f6da1e3b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa72e478290c8adf52dda652b0b946e97661f9992fdb825733c7918b7d50e11f -size 830493 +oid sha256:3c658a3b1eb0d07257f3ee17d8504a46aa6c687f4cd7f6ce42c91eb496d6df3d +size 746858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 0b6ca81b77..5b451ed87c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec9cb84e8ce622e166ac333cb95b1efb3ae23e32aa363eccb4b445ab7b43f2e7 -size 869177 +oid sha256:88079abe803a477978f5b516045366d2ffcfb410866a6402001a867ede594e8c +size 806704 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 10aec5d5d6..d34ef257d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21e06839bf6a89babbbbec5bde5010682c2bf01a84aa80fa0f44d86307cc7aa9 -size 770207 +oid sha256:6fbfa418fea85e49594e6be9224c64f1281841f89c7e49586c6e463795680d45 +size 712272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d31b6f3d66..1579fd966a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e391905948485319bb22ab8a4ef26e1b3190493d3811f6eb659544a5b9cc2e10 -size 905905 +oid sha256:9a0d5645d1fdb30a023b0428ab2db3a2a8d8dad5bc8f1ee1879155ab0f715813 
+size 852016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54e9cac6de..352598fe8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c62868bedc567c3dfbfbc8dcd237a6e3d67d26aab7a9e6503eb1707b8bf057c -size 867817 +oid sha256:1db26c7f7238f42b4718a9aa5245b00d71023c717b62eef1935774c70caa6771 +size 814226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9397f213d2..029f45a206 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:665602e673aff456bbaab7cbd97eccc16226e221afd581fb1e8f2ab5933b6c39 -size 895737 +oid 
sha256:4583ed1b72f11e780eecb8f4339533dc080f1c736743a5572806a1d45ae1b9dc +size 840466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e64b0687d..a4b106db73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be871e1e70cf7ec4b81e620cb9a722c254de814530a1bf7e79ed9dad752e9b62 -size 862583 +oid sha256:f34119963c459097c6f9acdb1ca1ef6618c70ad7c871e0174d3955aea9ab6806 +size 810224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 27c117a54f..279df62c7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:679bd90aff1a2ee110391975038d960f7a9ada7e851f11a40a3c6a7285faba4e -size 904355 +oid sha256:e8efd00fac3c6cbeba41193d4aec4c52abb6213d223033f7ae08c40b78d6c084 +size 861814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0c115d6397..6a6988daf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:772e6285eb8b09df17d9dc8373f9acb046e51c6451e9f4ff7d345895545d271f -size 813229 +oid sha256:ef86cfd788433e3fd8c3105bb0e953fa2283bc0e5f9fed828836091d269f48a4 +size 761906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a2f4d3fbcc..c47fd4bd53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:fca2356100532d48b2327b411aaada99b32def00dec2ed991e41789d4eccc9ab -size 872731 +oid sha256:60a8af173aed77f55397671c1890d93a0c7fd448f0be83c234d3ca28774f82e3 +size 832854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 29c43b5065..8a07e7c995 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f85b0288c1ea48ff30caf38510986f4ca24ab5421998878a41ce65a83348533 -size 786933 +oid sha256:cd6ee25f3d5a13a38c479ec538f975e1ed6d5e2a1c6efa919201070e7a46f921 +size 736350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 76d7411aff..a7112768a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4ddbdc7bc8f94bc35e116e3a4d5d0fdbce0053317be210a0fea9a9d14a12ade0 -size 1144975 +oid sha256:09396c20a544a067f0f5f70f62647041ff447db6d09fdb805695d880d5e50834 +size 999968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 29368ce3c5..566ee0255d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d3080aa7824d5dfa59316274bec64b311cb1c6b0d6884a49553d454c644686a -size 1000031 +oid sha256:c80d3ce90c759a020a5f2ffed75e99b76a2c3764fd4902c096e1848ad3e53437 +size 917282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 5efa633f2e..dc0a48c068 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b99ad67531d24754a8c977d289f0da9037e57de6e6e986cce91a8f7369467f4 -size 995009 +oid sha256:f570ba1387b8b254d0c30cd8513aa44196d3144c2163985c39238f82a44f7d1e +size 932536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 693b998bd6..3dc8fc6404 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a479356aba4179610d5ab7a8a9d45c82af7f82fcd22a60e1f9afb48b1003346 -size 961059 +oid sha256:b13cc011682afe0c03d7cc5151df8385582b764f32f959df5d0594fa4db38848 +size 896712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ad5907ffb..dd397a442a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10ae58e13ceadf368edae99681aea5f35d0081761408217bd1e90af72ea9765a -size 1099981 +oid sha256:7cb66ddb7e71789d8d6edfc166f059e11bfb06f8080f7c0829dcd2022083c7d9 +size 958230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index fc4104e20d..4d8006a7ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70ca51dc1138dc02aa2e0836cd207b792b4da2de0ae8fe7f5d42792c6bfef61d -size 958343 +oid sha256:fb08ab46fc0d3eef536a8bb1787ac652abff46157453bbf4c626e6713a2584df +size 879096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index b7a536b5e9..283dcd8aba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3acd72cf9852e07f02c483773fc53f7f59b05dc03a9cc7f111c5d291eecda4ed -size 854811 +oid sha256:8f331db5608ab1905335ed74ee87bbf382c925233adbbb26263f66152e3b48e1 +size 793276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 71293d8127..4b8a970178 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ce995c6932e5b6a3b62146e642e8e8794b94beb92c477c8b07c44d738816164 -size 843605 +oid sha256:007be7e3ed8524cf2c0c883587421fea80b9301d0f9cff647601914d4fc0cbee +size 776052 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e9e4582dff..b13bbbd2b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b16925bdcb34bbaab65a6994378420f7ad09e7b42280bf4c2f9566d235486ae -size 1131599 +oid sha256:3a67881ff3217fc7790bee3b3b4093da0bcca93cc89a125ffd439f2cc3d58118 +size 984174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8dfb02fbcd..1646a88258 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bed6556574ed20c3cbf25abf11533c7db26485fe559a0f0b3f8d3d1d58f99aab -size 985867 +oid 
sha256:1c2732d901acee7580a798a72609590d89ef02ac58a9ebd3a330d24287feb27b +size 900750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5c5be45abb..92da27845f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26c5e3dfbc6f1ab21940b43aed34aeca7143e9b28749d5d48fdd5a45ba4dcd36 -size 1090305 +oid sha256:bfb339fa8e8b8d48501e03bce51c84063e338ad117fb58021974bb3c34955f65 +size 948948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ded288b395..2b6cde8f0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ca3183c60b9ec5a7fbf0ba320c5d0f228392bb1ac9bab9eb42b17171b5e62dcc -size 948667 +oid sha256:69970cc7257683161e2a9a8989c1ced0b3e74454566b399362ce39deba7bfa63 +size 869816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 27b287bdf6..3b0d3e733a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc9c7b3fc28b50eab2fe3d0916360c009d67d8939ff72ebfd25748ece67be5cd -size 1127539 +oid sha256:c0ad692fabb64518272e27a5a365e203bfe72fe97cbe6d6553c50c588a5003ad +size 998912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 968488a086..f2118f7ae3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5031b9181085faee3804193404764b857cd773b54d8c1cde50f94a1d647bcce7 -size 1044701 +oid sha256:fb775d2b028c53d5f5879e34d9821c0e59cfa6bb95456d9fa32e49390e8dbb18 +size 900828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index dbd43367c6..c51debe398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb7ee49e61fe17d7c712927fbdeac492dc2591bee05116ad4c1a9c9d51dcf133 -size 999369 +oid sha256:f19e9ea62cd22ae89b6275b2c5c75ef52570a7dbe9a33cd4700204220a27e33d +size 916078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2d3657b370..5deb50ec52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0f9d0a385a245a414d014760d3913d620ac6452a64add2c27b37ccdde6fa6a7a -size 898573 +oid sha256:657f1c5e783c9a0cc3d49599d95a04d68e74d7f1335bc53270dee370c91059c4 +size 818046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 77c10df386..50be0a369f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67f5eae486525b8da90c126f336ceb4f3712d82a9b08ff97f9637998a887885b -size 914279 +oid sha256:f29d5985985849c235e51483d437bc57227e25502c0729121067c2b6d8b11259 +size 849242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index b1b6221669..1de3a17451 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:4397188609eb64d0eea2b416a23ebc8cbfb9d278d3fba81f2c3aec275dfef3af -size 893355 +oid sha256:4bfe2f4fd1adca1fd1ca457ad2c624e2a4e8b43935a8977fe1c14925663649f5 +size 797680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index e237313a60..f6fcb5d7cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b29fa2ee9f3e5b8fd71642caf5c0ef5ee677db0946ad8654d2c122f24214ecd -size 815901 +oid sha256:a8a80b15cc6e47583460e262c7e3a2a2ca6db6d6770925b838cda4d38d43f676 +size 756832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c93ac2de30..507dc8078f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:330b6fee1364fa82280a5be6df7761b891e33f0a8916472b7cb727854b85d056 -size 805879 +oid sha256:a9c8acd3be337fe006cc262c1ec9acb081c07379b1224b9aad773fc2329991d1 +size 740002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1de69cfdd8..e919f5edd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efcd5af6831fc57fcd4f2fb578cbd8dd6dfefb2b5d5d90d3cc0f87a72f5f3eb7 -size 1078105 +oid sha256:ba7363d0e5b01fba31988f0771b119b6b65f88800c11e3f19b413edc369febe3 +size 954706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0141a17ba4..c1976285ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:0aaec4f7da56e70e6237f056c6e8e624ca37eb9ad9972942407d6e200b28a35e -size 999361 +oid sha256:64a5636cdc4e13c9b2e85e9e28320d37cffc86368f13819de557d01978989d45 +size 860472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 8a2ebbc200..1a67ecbc41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0a201feeede6a90bb11115d38c36d8c44ed69ef4330c0a09d3f5f8130a1d48f -size 955559 +oid sha256:98db79be36d9cd4eb8350167a125a38c92b345f3b543edbc7eeade6836116199 +size 875524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e1aad55867..01043e7646 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:00241f8e1d18a622dc132c8a3646443fedee6e86168c18fcfc9e44c9c1cdda95 -size 857575 +oid sha256:809584ed5e4890bdfc74fc79dc49684a75f35f5498f1014f6b8a79604a7cdfbb +size 782128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c92f1d050b..9355df2c67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:899530e87f5fdc29c46169e41aa34c7395f3f55c174e19fc19b5eace118ecf78 -size 1050353 +oid sha256:bba39abaed21c13486e5ba1c47dc3fd8b1bee99ecbc8b6ed8c75b7896bcf9570 +size 944270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 9493987ad2..9463c2cf6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:549739c7567d3ff1c7921ca9aa602f68ee26187758870bc44e229befacdc5046 -size 963869 +oid sha256:6bc9952ab68fb811733d0cc2470f9d6246d225d821b06e03dddf40302b21ec01 +size 891926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index c9b8371147..f071a8df45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a6520d2c617ef85bdb7f179fdf35da3bde041d9f505cbc602b0fd977e1cc4c0 -size 946663 +oid sha256:c793acdd1f72e791d9e358d027492bc84b7fb8d79a748f1aa7d6afe1a778bf5b +size 900816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 46d535967d..6e7e9a5b60 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c869e23347e984a86a6eebe6bc9522771649b6812d43ba1e81175045e2083e31 -size 868313 +oid sha256:8189d650ea28c24c91920b8c29ec5716654a2643d9595a9237027ce072d104e6 +size 824636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 75433f961a..0651dbadd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adceea641eeb2ead3f8a0794cab9d3c251858157d5fbd0f1483fca4943d2a068 -size 1005359 +oid sha256:97c3a8d7016fe175d288a4d714b80804c027092e56a0683970102efc6abd8847 +size 902532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 7eb8751e36..18fd76178c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45a69e01311388f93ff60c42f9b0bab5e60dffbf326c106347703edd6a7d75f1 -size 921441 +oid sha256:029e0d66bbf960ee43015f08e9544908f54ed77c1a225c0f742533a9e0194081 +size 853690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 394debf885..839dc3c83c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dc3eed17bdc3b747403da3baabd72d94bf2247536ce121b78a2557604092099 -size 806415 +oid sha256:fa117e47e9835bb5974a7e65c98ba77053dcffebcdb8659725753fc9a852cb3a +size 761604 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6a05b34482..8255390f3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:505a28f64d91017eca329516c1d93cdc964b0214294d5ff0dd177dc646303fbd -size 750859 +oid sha256:2c6e103f38d0ec6e1481e2671af7f3eb5d6622cec6a743f16f8e5041ab1d0c5e +size 704764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1a274a6988..ef8a560b94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7b1ff0c3f31a955142e9d99504eeecb507dcadd8134684d0fdf1a0adb04393a -size 1035399 +oid 
sha256:fe716db43d6969643058e6942e3ee6e2fdd35aaabb32be00641ea7f02f93d7ea +size 929266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ddc4c540cf..606970f983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18293e5362cd80169efcfc19d4dc5a5dd5675b1d960cf2a8bc6463deb20e4d33 -size 948915 +oid sha256:c162c85bb727b4b433fe331317986ffbd6804c313268965a13ae979ead0e31fc +size 875344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7aa0ba7387..1ee5244cfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:709a1c3e20deea7f6838077a2fe09fd851fcf26fa2a8b3307f2180c28a8b0499 -size 994895 +oid sha256:8ab1576a0c222b93aa2093f8007105c7aea970f80bbcbddc3eb645b5051cd77b +size 893252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 7b009e50bf..c03dc11602 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67368f84d4dfeaf78e712f95239cc121b58be72809975c81e35aa2268b3a8e0f -size 912555 +oid sha256:da0b15ee0f78747d0d221569d8ae3e44289c3c7e0e823c85db30404016b85273 +size 844410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8ed2cf486e..9cbb42b009 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:d557a086de1780390c53ccf027dc7b985cd48334ed9296a7a34529e7c582736a -size 1033313 +oid sha256:2c0481ad5d8c45622796c1ace95054727ecc1aac1c4d6f4b1bbd575a02eff104 +size 939070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d0b74d4617..6bcb01569a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c006ebd94d7c3a10daf5f1b1e03c032e4ea7776db3fb991d893b0273573c3704 -size 951115 +oid sha256:b6839118ed81d75de7705031e063388e5b34bf7206897a08e80f8115784f7a61 +size 845920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index edb731f5df..b9ecfeb872 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f801a8edea52d7180ccd6897f5a714fb0104757e704eeb33fa3cbf8465c2d9fd -size 962863 +oid sha256:199ddcbb4b5ba8f09372ca622e019f35b79b198e44cf903e9ef1382cf5734e1e +size 891610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 595948863b..603c8787fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bae85666693751c5e11b0b4e0395088b7b1290fbfb1c27582e9c5d23bd1815d4 -size 862413 +oid sha256:39b0d5ab1a1798435b582377075c2a4f49773eb24b6398a1e4d6e27bf5eb6ddb +size 793428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 4cbc464c62..f0ab1b3b74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:e39c402122fcbbac759ee115c2fda71c29a1291b877ecc6007d69ba9cc7aa6ef -size 863121 +oid sha256:acce64524790fea6d2fd09b3b3d3c6f4f8c51506afcd452053be414a48f051c2 +size 818310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 1e984ae2d5..bff65aff55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbaa1cbc8b68f7fa6cbf9402dfcb331bf229a165ef39d9e3797339aa83938ffb -size 793651 +oid sha256:77cdbf8b6e9133f5be0ce06c52a28a97ef69d89bbc5440bc8dec5f1c3c04e77e +size 741884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 78c69be3b6..d99b78dff5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:6c1a2bbce9d40377fa0a0ac9ded3a309f79fd8e7c2925d50e1dd342e3ff4a9ab -size 767505 +oid sha256:3dbce68b4afd6e874214e761500a171f81a5edf0ea7ff3542804641dd87617e4 +size 725950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bbd72f11de..2cc2c4da2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84e5d5d16137a04569f458ac085b702015009a2059d7fecad73499bc94eb9d4d -size 712293 +oid sha256:f5b488bf6d8ec941eabbece7d4ca360f26e10f4e7b7f722e4cd5581bd0d93103 +size 670392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0d9bfbfca1..2d1c0c8f41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:42d7794c6d1e30187016fafad1726b9a005a225d90d3067c2850fe8b01f8b88c -size 984521 +oid sha256:e369262597038fcf8225ecdcd0476d76fb4f3818f8099a5967c30c60c4319957 +size 894914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 844f01146d..82429820b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f0abbd35305df6aba7f858203439a28d997cdc22224173a8a79c1839f274145 -size 905777 +oid sha256:7bafc3e6d4c6a1f6e72d0df481c465d900f1fea940f5f891ddf63bce20cd6166 +size 805564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 9656671a7d..4b78681d4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:3b943c2d7f8d01b3b84715b42e101e863cc24db35fd207ab0361be5973acfd16 -size 918955 +oid sha256:571d12b17e84f231f2f49ef65015d4cce8e69e3c01f6c01a8f0c82c7f9cb1a03 +size 850956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index d32e2a2b08..256857fd06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1dc887db6c6087c2010aad89cb19cf53a485210b4fdecb2afa52f9a3f13b5de -size 821463 +oid sha256:8d78a3270456703116e9ca625c4cf8263fecce82a9591af06405b63ef2cf4d57 +size 757512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 292ed4f970..828938fc59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de2bc7045576cf3588cbaf89f51bf567c9cee8943beaa0f515c5ae0549f99f29 -size 1011183 +oid sha256:387a78b1ccfa115a9e2c5fbf2bdce4830b1a3c7a630fd32b89dca3ce7676da8f +size 936772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 972920d2b4..f749a82480 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0a06da28d613e49b437bdaa2e75db8181c1b57886160c23b9fa3589d536c4fe -size 1077691 +oid sha256:45e77c14137e70348fd3ef893a879e136f73851990c3f0185bcaafc37fc2f400 +size 1066280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 76d6ca28ee..e7eeb09092 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e886ec93edcfd4cd1a4b187c87c8ea53e1e4b27648077bf43107ac27049cf74b -size 978521 +oid sha256:590121c362635107c5ef440b84464c819f5010e9f502893cc923b2d6b0dd39d8 +size 906134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 3a53c03e6a..407ae613d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2978edc4b4c8d68f7ce39fa7245affbfdcf6653161624123ff5a70b46253895d -size 934387 +oid sha256:ef48b86c97b339061fcd29419f0b1a409a7b309c1e42253817eac96e0feca0bb +size 930522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 265b6b2a3a..8cab274ac6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33e8de5ef90b796aed876ddfe62d0db6485b7cc4368a6f578f290faa45ef0c51 -size 997561 +oid sha256:b258c3ff384f8d4f8e900e955c529fd18b9d35c10a5d8fd5eba5626fa34766f5 +size 920978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c8e81eb4f1..3a21043c07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa384483252a26a45a0e02e7c87da5988193991aa4902c8f021671104ed6ac4b -size 970967 +oid sha256:901ec8cd6c8fee73abcf23adfb9d02ac1d327ca2a6bb6e03c221878f5bd03930 +size 896014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4f006dd900..5c74206350 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ca4e4d5d75eda5fdd713695be655426d482157987ef352798b8cc8e793b0081 -size 993551 +oid sha256:06b445b615350697d53c7fbea634fc77963437e5ebe2bd03e49d40e35ebb4bf3 +size 933742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0ad1d3d5c6..706eccbd7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5360dd3d3d893dca6edaea5490d51a1f2f550ba5fe12a3bb75d4685a220a0ba -size 912191 +oid sha256:a9eba71ebe58cc8f37042880c11fb7e0bbcd99682362de41c7e667242ac1c9c4 +size 843306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 3429df83f0..1150723f32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7545e84e245c3d03bb180279fe3f8f3baf4126f7ec60522bd2836b5da1c60d10 -size 1064303 +oid sha256:39bef23168e11d2dceda1d328f10ff4e6b3466b6d805224f811558155422858a +size 1091222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 97d01d974f..39825f509a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d54cc9a27c35b993b095ce1200d129a75099e2e74af094d3a146383a8d4f134 -size 894785 +oid sha256:e4b15d7645fc737e209f36eb46477d963648805eccb1b3ff813f3b0629c695ed +size 888800 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d58aa53aa5..12a70807e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a2bf52ad38e21408840f91d5e7fe6f3ebbecc37b26e3fe610b642b8cc73547e -size 961629 +oid sha256:21119859817d6ef8ff4ed1b08d432c819163e2aa154ac11453ff9f6b050e4bb9 +size 903154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 820ac75c2c..670c959882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b50c34b4836db90556bdf7ffad0c85225d967c9a4dd87f6287019fbe86d40f4 -size 882393 +oid sha256:612a0de178282b1947d832bb0ecfb47161e4ac56ec508abf0c77fbc070a7ed7e +size 815726 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a5ec883f38..9083daaad7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18b21088cc1b2a5c89c796040840b6927ba08f497836509e3c65c11cc5583e3f -size 1327113 +oid sha256:cf77c46067aed5301e0840d23055695377cb9a7eea821c2329f478960e022751 +size 1137312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 380461294d..ba0744badb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74468b06c54b381315277c5e749920cfd04f4a8eede34edd3abfd89f5e1ef3a9 -size 1125633 +oid 
sha256:f40f0083da0a0be5d7ae5340751438e889af2fc86da4bf5191a03cd575bc2282 +size 1027000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cda42272bf..640f59eaa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fc99c50ecc97a4ab13743d00eb18419c807caf3ba8afd56b3263f01272d83e5 -size 1127411 +oid sha256:0d3e01309ed1757cc2f73124366cddcbf68b07f3b08e0c75e0a8216791dcab5c +size 1053000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c214a67bb4..efd69347fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b687190d3c2675fda9ad933723aae691b2092c23dbf230ccbab524b74f4fcca5 -size 1270773 +oid sha256:dd8a0d8f6f3261222cf5cb6c720eaaa52cd0952ac2bfd51cee06f1a614a11ec0 +size 1085854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c4f1922ce5..2c29c3ea8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d9a073e2e2cfb44cfb955436605913855ad6ae23d447dd06c0b06f8d2358939 -size 1071809 +oid sha256:5aaab843f29b64767b56f6b8710e4eda0aaf7a9fef6c228f977d09bf6fedccef +size 979046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e418086978..d4f3383d6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdff4b1fe241e080a886889630e9a3c737202739f8d5cb8568b3ddd9417b5b0d -size 984599 +oid sha256:a2de20929f32fa57dc105235c3f0d3d3b898c90d0a8e61997837180420eafcd6 +size 903528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4726dad7e..2cd4973f90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d39ffb9cd0c669923dc7b3460f3010475f3d91de297b4f6ab720274e6888d9 -size 1305105 +oid sha256:3599b136b0606532c3fa09dec398cabc519fb0e7d5f7a4e803216c450ad10c69 +size 1113280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 52dfee68ed..acafcc38c7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4c3f3e09cdaa282fed3f9666f5a1c35de1c1d4eaa84d21de54e271dd29f600e -size 1102835 +oid sha256:733724a4d9fc82e0f7048c63ec1aff7cb7616240d1a75cd7d8141a5a6acd9598 +size 1002180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c8d37efca3..ca865ab57b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b029e17a748ed9e9a7885b085640d0b5378bd9030953f12220ad7a718e37b88 -size 1256213 +oid sha256:690dee6f5dcab9fa761e6d8e9a1c3e462b52fa6cd6ee7ff5ac8856400245bb75 +size 1071246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index b28ac7402c..b36c4f1d34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b75a09c54e5f67e95428f99742cc8d455c64d322d2c0f997162159d5012f54d2 -size 1058039 +oid sha256:1f5c6af4d79f9839e64d8234e5bbea4240cd67919717492fbc6f69cf2c978d36 +size 965226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 19a14b62dc..66c5b3cf96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0faacceb0fafa334a31ecd7ab4a90697e20449efd39a921d39ccc1eff25428e9 -size 1301095 +oid sha256:ddfa55964c3199d9b4a01892b6d87aa9f749c1ea7fafaa81638a4a9435ac373c +size 1128312 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 952700306d..1e3faa80eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6a2c12368e5cc1cede94d6455a36087d28b91d2796a847b62bc9087eac9f5c7 -size 1220525 +oid sha256:8ddf04cd3469c35c0db1b2ddabf06b10a8c2a685b4cccb32a5197d192b8ad6d3 +size 1030920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index e6b498e1a6..0679a6b279 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee4ba13764006ac85d6c2ee1e0d1337598d1af6fdd390ed8ff6930ba342c2ff1 -size 1126403 +oid sha256:70fb92747c13cec69749d9cd95e8571bde250b26ce6a1c7afaceb0e6060a913c +size 1019382 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 44a5f2d771..f8c8e6f5ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4222f96e84b9afc3bf184be7ac54c4b4094fa4a6fa55c9672dc5e33ff17c030d -size 1016677 +oid sha256:13eb558c9e5624ade2514381cfceae2bdec6c0f1daace92c155b3570be36e034 +size 920462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 20e648777d..c1d46b39bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b94acbfe004573371b63993079017a18c7163e0166a76f6d2606eaebadf81a7 -size 1024335 +oid sha256:88e3ccdf6f091e98535f7fbc1e03ab505aab8d89e1999ef4fbbbc3229eb013b0 +size 893880 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9abd800947..9eada23918 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ac920768707048a9418796c77753cf70551784ec3272e5d1975af8362822e93 -size 947367 +oid sha256:436bbb354ef67898bdc6b07413b5c7a22a4a0d42bdac473392c01d64a39a706d +size 869896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 54f1bb0266..5912e4085c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec947373204c0438ec024c4b6a0b0f46f3c0450872895cd2ff9fe04782de11f2 -size 1234937 +oid sha256:1bf8e2553c6c12ae9db8926b83753d20011fe0271440371494a4488f47799485 +size 1074636 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6203097a13..1db3430228 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7fafb1eabe8c15317d8b47c51b21a7dff64dba06397f8a8639f5ebdac8620dc -size 1168821 +oid sha256:c1298ac34c2960f2b5ab166666dab27c1f3e610e6d4c53d7bfce4ee27e300ac1 +size 984990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 7e391b8416..940d070f2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:094c3344fc75df1acfd23f73a7a5458347f977ec61b723ca8d0bbbf42b84e489 -size 1067547 +oid sha256:c84b6419ce455c2a53a45fb3a6b3a6bb7103e8a1f7b1d000c357bd9487e80b95 +size 970392 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 68e08a6742..e0ce5c98ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:906c5c4aee31989c5cf1108aa618a7ee1525256a4cb14270a13f345324e9cc1f -size 968527 +oid sha256:44753452ede7ba14e2e8ae2c9681cbe98804d76b76538fec20abc5bf25418803 +size 878970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 690d113325..9d37c7f484 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7079714b4361b135947d8441a5a14c48920155843c471ef7e741b1760eb60c5f -size 1213843 +oid sha256:bec14bc3c241f1b7715f5b4adcc2e5b7b2e6f5dc2bf4c9919a136347afc0b466 
+size 1076928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a40b8dc23c..0163b04b34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28ab8f60e52f68771786061bf11ad9c00b3f662caa94e718593972c55eda67b2 -size 1077337 +oid sha256:0b0334e429a695fdbfd31fafd90239341aa20b7fef1e5cbe5e996d184d20eb5d +size 994440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 392839f2b7..47b902340a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76b31b1fc6933367f01e236f54bf76f24dfabee70326d4fd8a7531de539e32b5 -size 
1013353 +oid sha256:d264903d0b3182adf84ccf6a51df5087cd4d2ddb2b41e858c0f32d31a79b13b0 +size 960994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93ebc4fd3c..68e2a2de24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03b68b3788fb2aafce16812467f0bd8dc00e41d753b01d500892b4055e144d0d -size 1157503 +oid sha256:b5cdcea098a9fe557f9e467f22ef06526b0aa42a9bbd39c237e315fea2420052 +size 1024730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 4a2542fb07..a4fb887775 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6f4d61222433f6a54c51652aff4de00552b2e5175018e11627fada880573ed81 -size 1024301 +oid sha256:30b608a0646a1d7cf8d398b4c7799861b256f6c518f644284e05242a6720690e +size 946536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 00c0fd6faf..c08d8cbd2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a0c1479857dba980a524a968bc070849ee4b48b6ba8a00d69698332e9fca4b8 -size 869751 +oid sha256:e6d764556aaa7a4e5d485ce1238a13caf1811cf900ba317f98598e40a961e8b7 +size 811472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 33a424e5e9..7c6b5b45de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f146d04f835633dde03be8d82d9a7063103e9ed1ccb3e5b5d2b33b669c02901 -size 1190257 +oid sha256:f5a6c25fbac7d8b672e87d1ef12c0a729d75a93f0db4c316c7042e32241189b4 +size 1052896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8cd06df755..76b543e067 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec132fe40d7661f9775d34facdcf67682734ba39a5a316148a4617255f91cf19 -size 1054539 +oid sha256:774d2ca40ea8dc93649055e6af607ce6b298b7a74cf1b5908e832f26a9834210 +size 970408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 78ee21350b..137b720392 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f36c3fa94cdbedd7c67c6f075551846c5a973dba67c0d8239ba09b53a792ff8 -size 1142155 +oid sha256:3ee44caa166182470a17e6b07e962e48063f513e357577a077669b49a6e8e489 +size 1010862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 780e86a80e..0506b51e57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c37a86597c025afaf6fe9c78e4305fc133cf22827b8c1b11731693b67fbf7f4 -size 1010581 +oid sha256:2588207e9c6e84cd31f5679e377890406e5c16e1fa331a8d62c8d3e0f86ae3ad +size 932666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index 4c2f3f98e2..7cfe8bef38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c93754f987f0be8ac45711b6e95d684dfc9a766c9f7bfcb0845e03e1b3bd4fd0 -size 1188171 +oid sha256:86a3ac73abcb84f09baa67a0912e26c875cd0f273ad2022d8f7f2b49796ad5ba +size 1066104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9b51108334..806bd0af42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f07bc31b644d8a1ad518f38085850b77ab793be14ebccebbe62c629b98b86b77 -size 1107255 +oid sha256:1c35b251842ac54a3f2792c77e272cc69fac97c7dc8856ce1ecbf0087503cd3b +size 970586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 
c0bf026b50..a007f4e637 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6ef05a31ed23613a295a0601eb4a16aaf4f84f27d50cf36b5c8917eaf02b5da -size 1077711 +oid sha256:a392e9331a6c33e4dce2c880e598df2e9ee35b4d48680d183c7e86a9c812abef +size 992002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 1e02981ff3..148d268b2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb720abbf2eaf429d11b262c52ac61e963440e118c1809d2873d1823245a0250 -size 969269 +oid sha256:ecf4468d24f42dc9602667334940d90f826b136335f613697a03e7cc5b1f8673 +size 888690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 
634dafd84d..0901e1abb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e63ba0efb7106705592761c11b25d651e1ac9246a92d7689ecead83b75840fde -size 903221 +oid sha256:cce21f2c68b18dde68e27b966e6cfc1f8c1a6f5ef4f742ec113f8adedbfcf45d +size 833250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9e9776e2ef..fc415e522c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2eb785d4b9885020e6b304d46ffab09ba5d940b1a7bde5a7c6f2caa7f07e7fa4 -size 832519 +oid sha256:0694dbaed42eeb1723fade555c5a64060751dfeb9f1b6947c89c874a5b52b867 +size 777100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 
213e0de26d..606d27b560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4aa60b0ad9822c48e277c75e338291c3d8749188995a0ee219ad1d30f2be5452 -size 1124973 +oid sha256:a0adffdd75ce11486a395d62af7cc2b3ef765ae89022f7d79a9147910d4b75c8 +size 1013610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7496105d37..43ba1663a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ba1ce29c7103f06104eaa49fe0ab65857304071de917ffcad0454d82e756b47 -size 1055553 +oid sha256:308d4f66f6c92c2264bb2724def04e6a5f5925aefa31b6c76cb7f88d1bcb1eb5 +size 924654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 
9c9004fba6..644d217faf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2281f17f92ae49ae979788526e33d4492107329f721b306d7e615a72fb79624 -size 1018903 +oid sha256:801b7977eb71c5728061e082573154837b12f167eb77b374e2b49a8807924ec1 +size 942618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1f76a789b2..c8036027ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36751c93f6a3c6f2b04e70df82a0ae7eabceb8e8b01aa9318021cd9208d0da2f -size 921067 +oid sha256:bdecb8322cea486e0d6cf74fd3a3bcd6cadfaf8247c03fe3a7c75e46280e63cf +size 847200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
963f26ba75..f0d0069a24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c9dec1b7c96a089d82a434bd3799a59c759cf9d4b051c71360bb832f0127d42 -size 1134615 +oid sha256:fb603f4c9bf3273ae4049f7a3362aba4614374eed82678955318b2e89437644f +size 1069232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d2072c684..0f89a5cea4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b72db5e5679336e823303f69889f8aae5aac69e577a9c42abc3bdf21fb136c6b -size 1095491 +oid sha256:337cf8299a05df6ce258f6240117b16ce6919b85a7625d36fca6de2bd9af9529 +size 1031638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2afd92493a..19916acff7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cae9bd7e2f9676e3524bf1671ef775f71e1688f8ea0ebfafe0f7a3852a20f289 -size 1112359 +oid sha256:8eb471d74f5cc0ded5fe5ad05cf8428861ef8cdd2e90a9a5625c64f84eae219e +size 1046088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b04307dee..0b9e8e4d5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:817de57bd2c3f1337dee11389978e0fb3dbb2fa6b88fdb1548e65978a8984bae -size 1082559 +oid sha256:52275d876048b6dd26749e81547d46684287e7690403cc430d6ac3e72810f713 +size 1016930 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index da8377a787..8fef0e6bba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed05d7bc0f01b648e2561e212082e814ae23188a0c2ab5fbd5ca7807a97acbdf -size 1113825 +oid sha256:a3e77e2c14ad68b9dec7b23943f5e9dce9554c23106b2613dc9b72001ed157bf +size 1068718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0f0f215564..c1c99c07fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3aa74010c9690f4b3882eba8c943d919a3ba3291a435e888ea1e3cdf6e915622 -size 1032367 +oid sha256:89a6996f4adaaf75d118caa9397f188d315b873c8a18d8b6174f2f4fd81106d8 +size 969254 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 72ac5c44f3..461f0e5585 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e2b256af9f7af9f9ba4dce3ff8dae9c77c656c3fa6138091e44adecd26eefee -size 1077711 +oid sha256:91805ef84a36b96448344b65b9941a9709eadb377297b46d96ee00c85816256f +size 1034824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 19cba9b81e..8cb196b76b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d18aa486e4d32fb2df3ab0fa1b7264fda3cc1131a84da6b09689eaca3af9b383 -size 1000397 +oid sha256:aeb79a04fd90becc62d5ab8277d912c31945829209286a4fc82730e8d9929d33 +size 938122 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 48a3dca148..9bdc436118 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb750195e3ceb466e4065b9e780273512063a3485c2b05cc646b49b8375328d5 -size 807297 +oid sha256:ec87888fbf78c5d257a2d70173f7e74556ece1ad41c4fde41d99b6d348b923da +size 743888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fcfa0ffece..ec293ea260 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c681cefe58f065cdf4bc2648d5bae2ae4752ae1187773f1415743ce3d0c7ed8d -size 785585 +oid sha256:13a57e0a8dfd77e0148f607ae9fa6682278957792c8b2b28103d9df65d4be3be +size 743190 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 941ec89c11..852cc81c5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2437af0b38efbf12fca997e402225d3db363af2098a9a50f3485ee8e570bae35 -size 753257 +oid sha256:2f7f437e0923584e45a85bad61c39ca2a48aecef6459b4b6ee0b8dce5956bc92 +size 715204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ceedf93e1d..0d71d55c1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc45a8a81a3175c96b61b47dda4610458da31ab3f3a5eeb43b1f4f0e99ccf922 -size 656063 +oid sha256:383f2e777e32af1be9b10a79b57dec9f9ce2ab75a4a541e8cf8e9461d6821921 +size 631428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 685d57d7ad..6f19855a48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1329ac9b5ab4bcc04c7a1754a1fe20d16408294c3fe1d5a6e3d6cf3589ffe29 -size 797627 +oid sha256:94b8f7448f943a6ff2cf7cc3dbde67e57ad7efb39699e1bc272ff81e8b79f82c +size 736734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 803ce08a59..12a4538be2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:111e3dd164bb40f04d8df1173719a13a8a30e34463dc5b2ac9bf7e3cddf432e8 -size 775915 +oid sha256:0d77a2e9ddbd28e00a53d0932d5821d4d5e6b26b066ac30941693e350703c533 +size 736038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fec3a27790..4e6769a82b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0bb290a727351d032c6a6224e603cb949e5f8ff7bbcb9cffea75effd731e311 -size 743539 +oid sha256:40967d090d01add94fab1a6da898f7520a2e8ed25ce0febb77d88621bfbef12e +size 708052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index fa0a5068f4..0cbc66c52f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3df8429e00fdff07fd9d61258074eea47d2f2a5fd999d5ef47aa70ae2aa9bf8f -size 646393 +oid sha256:f9dc06cce7b1a0edf6a79b26fe12cdb4104fbd83521aae7f646e8cfb49b8d4e8 +size 624276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 588a464118..25a8b78dea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7b88b454240c682b6d73f7ce03ffd7748f7769acdfc0b8f8644f77e360c2177 -size 664473 +oid sha256:e6f1edfb7b3f9d757b338f1b4302388dbfea92889d9d266c9958f467d0ab2e80 +size 628788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5e58ff2c4..ddf633f44f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ed2417be42da7417cbaffc4cb6edc3f00bf1339f61c23f7138833dbc6cd09c1 -size 616864 +oid sha256:69fc108471b70ce977e520bf44a7f3dfa28f7bc6d266a2122902023e680127eb +size 585865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index faf09d9ac4..35a724eb54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed8deb30a5e603fd1fdb840968f9279a23352d57d4f1dcb32abdb78155d6a102 -size 654945 +oid sha256:5a1652a16d25ecc6644183a546449382a542f05057a5be1da0fec9180a3d725d +size 617731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 95bfaa2ed5..f4677139ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a28897f7846987291c316689b27d99f3e602aaae3b6c64ca4ef8e056d360f00 -size 611382 +oid sha256:be64d388683d26dd3861560dbb57b0ca36b419deddaed6500ea69b9d32249e3e +size 580335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 48ff04c1d9..21e8f451c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dba6adc2ca1b0953d74c2ba0d1601809df6e63e361d47bfc724212e67c6e511d -size 655425 +oid sha256:e3e92cf5e264cf1ec05092b13f8a96ecc1186a0b17ed7b4d4b497e9ddd968d89 +size 620776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2ad144d7c..415836a087 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:560b40c3945f07e18bba652ec90b94e646e29253de00269c447b34758f90b7ea -size 556848 +oid sha256:838d37843cab03d675810af5eed6b8b7a5b1841416bcf239c557f0f8ba1e521f +size 529107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 589c95006a..880c1004fc 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0823da7faa7e403fd4704435977fd4bb40894141c048ce4c31a77c7907a7cfcf -size 600416 +oid sha256:e30cf4719468b4e0825d1f288cb3100b712de4ca6a226a4b3f5262df6213bb2c +size 577311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 88ca2af89f..7e5735b2a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17e808c5f70cf8acb0936db68714e90d6a7a0bb4bb46d793314aeeaeba8d0895 -size 522264 +oid sha256:60c6d18468740cde8fc14950928adf633548b6003d48b69a1370026dca05fe86 +size 496693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1b9fb5977a..e4db658915 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cacfb255692f5ea283de64a672802515b5dd77a8568ba1b42abd485100daf3f -size 654803 +oid sha256:f4544fd1fd61c1ca9fc90960ebb7c14dc447d40709c3abd5580621a09c0b35b8 +size 622424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3779d8182..2db613d5ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d499e1545da70123430d56bc686091a7348d74bbf74e8915a962077867990d0c -size 607984 +oid sha256:2cd637e246feef65e8b7fcbcf0ccdc5f95d5d98310cad2f2ad582509d7eeb8a7 +size 578713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d18dcb03ef..8828030931 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cc24be6e406af32589fcae6ccba42a6d1bf4344056c2cad6becdb15fa68fd18 -size 645277 +oid sha256:b1c98a01b8c3f8f6ecb867654072eb968765734c6d1b21f5e03ff81ac7e3ed78 +size 610577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d6792db3a..2e8934f814 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfac97c6b1996c319479f3212788d1b3844f84c49ff5d5acee693db9bb6ab3af -size 601712 +oid sha256:6d24fb8172de4d0afc79d87200a75589607eb4810767f85faaa2c730c2ea5c46 +size 573181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2f1f1e2c0f..eafbac50d4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:512d93f7c94ac1ea162fd117d8a7b6033738dc13b0b960be75661b07cb2b8e34 -size 645707 +oid sha256:ad296002bd1ccaaa3afed9d25274bb0bd847ed5f314b4bd69e3a7ecd8d85f02c +size 613623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 76f282c83e..81ae8dfe1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4634306dc7f1f3595286839e5ed246829bad2f61cc5f2d28a8f9a2712e309b1a -size 547178 +oid sha256:99d51951a801a2f9e133aea162771ca414f63c30a6a73cf6e68013b05773fe15 +size 522003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 477bdb9c50..cfa6b239b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6073185006c6130737a9ec1fddf00901f82be49fb9251e5081a7be4217c259a6 -size 590648 +oid sha256:177c84be28e8e16c60b5dec8a3e12c63217632bc0a29751638dc0083ac36c226 +size 570059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a817fcd69..52c3bb74b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74f9b2db91d0c718129c72788ffc75b1f62d69d12ce03f61109599fc1dc947de -size 512594 +oid sha256:34e3e9749c953f948f18c19123f4bbf9264246d1a96ea1b3bc82bda4d27cf0f8 +size 489539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0137a4a35d..0ad88a0e34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17c213f64fd6a0ac7a6fe336363d80d3ed67bce7901266964d7d54e92173ced5 -size 682859 +oid sha256:15dd1f6de66e1ce3c37e4cee71c8b8dd5e985e0279e9aef1bf275920492e21e3 +size 647520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f48428c072..d61bb22b74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a77c801cc02deaea7d62fbf7c887d8ff098fe09d51232d91424033013b1db400 -size 635251 +oid sha256:bdee4653191e9f5e3712688c863eb82f0b43eba561e31d55bebbda05a0ef620e +size 592659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d9e28c9323..1f1ea2a032 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43f687311e9a59d53656bb9ae5ec892ae88acc8cc6c7731ed9487982058a48ef -size 672543 +oid sha256:0e4816e343201f9a03ccc18ea786a0b686ac17f20321ee8467979ef43c3cfb05 +size 637204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5916f2bc15..a003ebc850 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:767d0b88c8708fafd2124a0ccec0012fb5a6bf0954201fb1c017a76bf13bce97 -size 629029 +oid sha256:307a4e9668daa3b7784bf48f1f40f4e8f789475a6608c4fe1c0462bcf18d955d +size 599213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b1ccf7b3aa..0ca30e591d 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51e899b88a16c3060bf9b1e6fb121b23cb06cbf82e92fa2e286267bc1c57dd06 -size 673663 +oid sha256:390b379ae1751100c72f8f687aeff748a3ee43dc6dfd859c5ea4d64b82564c97 +size 639558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0781af66d8..6ac698b8a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6eaa999f9dd6c33711199f89fce45cd14b404b57de1105830d80c22709e048ba -size 567340 +oid sha256:c51e95041fd32dd29209a3caa516a56dde1c967a02ed36905c8e0c8011faca7c +size 540783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 24222731ec..120e848984 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eced9c8faf1ed6217ad4a9ab377436020adfe6f7c25e235bbf9a03d58423bb72 -size 633307 +oid sha256:d902bc72f515a432fa7ea3c692dc4d847fd004448ac5bfeca1a8940097221375 +size 601963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5a8d1abb9..a6ddad14ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f712f99a07fc7b23ddd81b197a9175ed82d33bc7586f05731d609c15cbe0816 -size 531966 +oid sha256:47f2bd9446b9056a8224c8a3f7934aa54c5ead63a654a28e1ed5ef1edf4ecd45 +size 508369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4de4f8db67..b962854468 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93498cea7c5d7d385dc2a4fc012e4be048e4c1d4d474ac49828663bc8ae69685 -size 673189 +oid sha256:744f7d6bcf94fcc225a85336fa47c08157231e8e54959049818b49adf53467d6 +size 640366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5cd2328927..d827d0f74d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da76b6be9844850c57306655d8d7c0b8c0181292182a1b5312c0e48a012ccbe4 -size 625581 +oid sha256:a0965ccd8189ee61fae9f26907e6df575bf24e7394ae362a6d6034aba10438c4 +size 585555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 
a253b48acc..8bbfa85c1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48c29b87abf944d89507ea2927bc331e7bf1abcd76c2946b7e82a0e7a525b0fb -size 662873 +oid sha256:33a39ff77610a89ebf447951c6a33f2d738e8cbdce1ecdb98bf97edf682e5eeb +size 630050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2276195f58..ba6886d68d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a0fb6f62072779d55a4ce4bab1d930629d3c523dbd638497d8e515781a9cbf9 -size 619309 +oid sha256:a352f7a58d3525d0a3391c364607cfdfe0d90829141824ff6d5dca2992f14eac +size 592061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index db43868d23..1c0f041eaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3509a23ac31c65377735c3dac6f277116280e560ad7e092f7315acbf87559679 -size 663945 +oid sha256:e09b34b323bb4f7c999a0ae1a449a3b363520f69e66246f46f560aeb9867a676 +size 632404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ea8f6f7d00..3129ae156d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca947a21d7ee97aeb1c6b07ce9e47efdd715e3557dfe7750b6e0706ba1b7db22 -size 557672 +oid sha256:8855c6b8ffc1aff70067a0ce8fb12ea235518be414672a9c08105d0e27c20bed +size 533629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b75e3cd1f7..52be8b66e0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:276972a65ab532c6b43f35693d12b16a896d301cddf615fd8948e2100383fe1a -size 623539 +oid sha256:2faef4a646edde8c27a4478a3637fbf4ccf1f81251331a01af7c1b66fc87b265 +size 594809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 18f468cf80..66cfe4bdeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61d22edaee3edaa0dec79d40e74954a1e31fab3c4ce38a1ac19ccafd9bf05266 -size 523038 +oid sha256:12068032ecec0798c2f69f5361bea5704ab44ba4fb6b744d04d6915aec7754e9 +size 501215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index af0f2b5fe0..08817143e2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ea1d3d35d00f267247de31c3017ce8a3e0e138e4b6a71bf8154142fc1371c4a -size 778877 +oid sha256:d136dc05b235e9b65e3efb43cdf1b30b73fd6463540b2c91d038c5e41d90fe42 +size 726568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eec905bba6..4cb2d36572 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8e28b6a55ddf049cbd5cd4941d114d32125b0b892439c501160fab5d5ed4a8e -size 737139 +oid sha256:f06cb07aba3dd6f82ce1e751bd593ed004029c10a4060b3101665b5170f3bdb1 +size 688628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8dce3f42a7..0e7c0b1f81 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66186e9782215abb75aa78f582f71a7a3ead04c09480f3020d193bfd9a4b2a82 -size 764417 +oid sha256:1ca3bcc9be8a53cf38fbddbe76b538f376d6dc0c62439e4b8fc94e5568c6012f +size 708998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6915865912..3ab48c57ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02ca742dd61df64c2f85bfa5a4a37999724d1d6b71838a1b21801729392d518d -size 727957 +oid sha256:468587f9ee1c51c3161b395b68c432a4c90f6a7907463595bcd706e38823f510 +size 678360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2a74c31be2..816ae5a964 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2477ae5230161fd446a069f9ab6fa98b1775d3c4c91ef68dde82eae84e2bb17b -size 772591 +oid sha256:644a90dbe20f11ed36aca64869286bf400456ec4e2a6a0f0f507eff3567866fc +size 718456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 860139dd97..4c266ffd8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4373571a3abb65a6c170ec4e0953414c1330f379ac8d79196c6220030b70a296 -size 665135 +oid sha256:fde4add931e0ac2a15060f13be5cdcb4d31ce399b71099a2408e8072a4783631 +size 624666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 23c8fb25a2..a4d67016d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a438e80aa8036f50a602f0999ffe3cac1cb65eb06dd3c9628587de53d2937912 -size 706631 +oid sha256:b966d0209b009faac0ce8740630588a36c68c093118dccf80d5a14ffbf3c6ebe +size 670206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5a184496c..647ec02572 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31007b012a189bd83ba5f42ae6a32ab8768c24e00baabcdc858584c9a8c66d59 -size 627345 +oid sha256:81f022c8e0dddc6cba8ca56e345f1b87409220300792ccf3d09aab9dea38318b +size 590229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bd906218e4..8518befa1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96eb437b91b51472f439903775ecae58cfb1e19712a92b404179ae881675ae17 -size 759489 +oid sha256:e59626d1d90b6742ddad0fc4063148299c957f1b04abdec59f081aada037a9f6 +size 712260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e3b91a16b..ee8d239b03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32bbc9576ce04f83582cad7ba334c031ba04e5db10ff037147f3baa5629aaa5d -size 717751 +oid sha256:6f17f0b9b1d381f34ee812fcb6d810e611db27acf841d858fa20892b1dc6bb19 +size 675160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5168cf47e7..00dce22894 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b528ce1429f52d066f4d768bd65ce4fb10983bdfebf40125c1df1868db4a0125 -size 745077 +oid sha256:b8e3c9e6735c2757cf1903b874d063757236de146dc85c1c32e31d819e52f664 +size 695482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c1d9312892..1a742e3ba7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcc51074a3adfaf729865faf07c1a9bc97b170be47eb910cdc2555bfc2903dd1 -size 708619 +oid sha256:e5f93247398940d5f9416c7ca649137b6bf1e7c4cdc1c135e28f1249e54f6e4b +size 664844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5de6a36c26..219114b0be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82f9910552b0de35a73e553f1945465bbdf6c63f18985a972d8a61702e77e84b -size 753155 +oid sha256:e0ac6a24ab661b204bfba68b65e8b70c543609978ea2c3ca411daf2b3e18a524 +size 704988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4acb0ef389..61e2f4c28d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d71f32feb29c7aab283f073d07aee6e5a74c0c3f552fe73d1d18d1ea6f0f408 -size 645747 +oid sha256:530218ffeca373beb60659ddbc9a6fe3c38773e2dcf30cc3bc3640cd213e9189 +size 611197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 44aed07dd5..7fd1e01689 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2ceff53302debd6f821e0f6fddaa94a183452c3a75afc3f24ba56042003936d3 -size 687045 +oid sha256:31cfc2642d6051ef3c70604eb99d780fccedc16094a33764a33737556ee9ef1e +size 656788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b16db6ebf9..71e06864a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92b74e502b4a9cb7780c50f1f259831940c7057c5542b1b57f7bab2a18ed6a31 -size 608744 +oid sha256:c5e0fe1c851269cbe1a307234556d41cb38352972712ca29c94713e2a72857f4 +size 575971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d5e2b49fee..a67cd32bf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:720638bfb2526e50b234e1bb3f22988613d54bf77ee7b041bb3ebb07efc319ba -size 798053 +oid sha256:c145d5ae7d8cbe376bdf9ffb9c012da35ef8409d457673959dbd1e291000e599 +size 745398 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12cb7c93ff..73b5dfafb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c990fbddd555612a545f5874e76a8df68a91e4722c437b725d151b58f3c399f4 -size 755525 +oid sha256:744e017c947a0532352fc5985398c3b5d3f587a680485a6eb2679994f6f3306b +size 707408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 198b805fcd..17c96302fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:66257501571d759dbb06cbd203b09e823eca079ef6ac9187e00b9c8afd9e1a0f -size 783591 +oid sha256:b7be98c2638865dca9a40b458c8efd3b0079dac762e5999acf990a05a61722dd +size 729408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a1cd0c85ea..6618a8e1b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:494154fd13093f78edd9c994bd66a1e045a3c99e131c2eae8edd461a82d2151a -size 745603 +oid sha256:27b11e5cc5606661521b313b7871ce538e411d67d1dace98d4474e5503cde0d8 +size 697882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8169ad7994..4871ffcc0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:cb295607138805db482f27ba7eb6bb7cd7103d8fb8253699c589ffd789c09531 -size 791421 +oid sha256:f1ca6e7b96ea46fbae0de826c8dfd7034688e9522c9e188707f2abdb0a54e4ca +size 736448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ddeeb84ac3..e76c71c91b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4601af2d0b63c601a4ec7678de31e405052a19d2b05badb583a2f7765e6cd8c0 -size 673409 +oid sha256:c621a75e4423a745cac360c4572c5f6b9d2b2666e569483661cfc271829fe0b2 +size 632150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 825a18a1b9..cc62246d36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:31277835fb8253e0a56deaccbff9838e2d834646665c914f43348dc7748233b8 -size 743615 +oid sha256:b90f106095d3f763f9ac50c5253a6162ab57fe95a607c5b9e51e8801ca55b98d +size 695548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8ddf29c128..b46f5e6e2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a50328bb792e0e70cd6dd2bae20b9f9b344361993802923ea8ccd5a8a0a38f3a -size 634827 +oid sha256:a391633e7b6ac6ed642a0a9bca6efb2c6eb998aeb5484d75e6fa857c9ff6adda +size 597711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4bb162d089..8376e9af03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:387aea135785fae9b10044a72c1cd4522eda9f89d3d5e800a7ec12041ff95104 -size 777925 +oid sha256:6f4a54a7b41435096d22c247da9f575529f009611d47096495976c5d0f23099e +size 731880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b5aa0dc7e5..207a8e83c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a61002f21e8969d7dfca4909f8075620a04142e3da1e7f7eea55ff3e1d92bbfe -size 736137 +oid sha256:d0b7da78fa68a5a915b0758c8e1571c65d332e5da60e37a0d6b3aebe6b7fa8c0 +size 693102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b69997777..6c88a926e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:b50bbe4c21b676c4c7c60691dc3301db272ace25f42bb22d72ed810e90b53904 -size 763463 +oid sha256:01f23dd63a6c6e5e200c3b4ed4b4948df2fe1ac5be47bc045d73611c56f862c8 +size 715890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 55c7fa30f3..ff1af71cdf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53d007ba3a67407a40bb9c66eb651619a5b06eae435850a72300b5ab648a83ec -size 726215 +oid sha256:9ef77ee88902708b38472ed207b5b4efe9b842d45dfc3e46fe018a1338e3708e +size 683624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 24516b848f..e1edee1f5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:13cb1c0b8f3d986412d2e7f63d2df7c90e70e3e7a7d8ace81c95713e5a664a1f -size 771195 +oid sha256:94ad46a42c23096f4639601c889db951abe8873e0032bccaab6bf4b7b7fa4228 +size 722930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index babe6362e4..5727d70a7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eeceff9d7fc7f67f7101470d40343b6cf6f450c1d239fd24ea44b0f9117b74b -size 654021 +oid sha256:2a1f2273e124eced05074d89b27d70fa4c8f0dfa2cda0615b65d3bf7cd081ef5 +size 618682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3a09c74376..8f428ddacc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:eb8e5e555724068a26deb2c5c7d78a8891bad92e9045f6df93e2f3f22dc3d370 -size 724129 +oid sha256:909356999f66babecb542be1f3a3d0df1dc8f5114f9494ae559ea497cfed984c +size 682032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1d979de787..7600eda9b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90f46dbdddf5a650498bf1600f1f816fa36ba11990dc22192cd81d2fc10d4d4f -size 616228 +oid sha256:9b6ccd8268b583d649b86ad6320b7ba752c5faf33fe93b6337e725b793549925 +size 584195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 471cad14a1..3ed389c72b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c2b829c4023e1012512c36ec20a41b27375ed485f90a4e1d9c5f83bf86e7620 -size 
740005 +oid sha256:bb8c0fabd78224a06d257a4af2a401fd646fd3d28108c7ab4c0959b044aed635 +size 688386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ae48498123..264d0a9586 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e92f71ef1063767ca4254795aaae594f568264d97b5356c0a83c2ba726acae36 -size 718489 +oid sha256:e8c172c7ebf9318afdae556b2066e3fcd447e9e80f71a7cd35be72900de13ba9 +size 670028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 55f3f72515..4e000dcbc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cea294ef77005ff9421ec24bf27323c42bc6e9508a5b4227c669002d303aaa6 -size 698939 +oid sha256:901b24457ac923fae4e0450ecbad5a0293db82694e4d7508f5f3afad42d49387 +size 664586 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 51da8ac9fc..0834c8f16e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:964c13aec1246a629ee3b3fba4b741efcfda08555e3d9529ec16e3fb8c41fb43 -size 604900 +oid sha256:ce5086d62076229043ec09c3a2996ea2dbeeec8d666347096c701bbb7b5e6150 +size 577061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ba4e45076e..902ebacfa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08415038079827522a73927a7093bc2c8c9dc70ab0e5bfe77caf0185b3ae9006 -size 730285 +oid sha256:e5d4e4e9e2fecfcbadbdb8091bd9c1e766d32400059d3545ae0faea96c2d4384 +size 681232 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b888815bd1..d2fd6cc262 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9e81e52ecde6ce7548d55e1fee6de16a3998332d395dfb4fbc1ca1528559f66 -size 708771 +oid sha256:2240b28fd6abe0dc689d4b30260e19924c76ae8097397352597a8f0d0380e62e +size 662874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index a2da2fc50e..c6434a3ef8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94f62ba2b491b94d93053fa706fdf3e2995b95df780538d360facdc4bb667828 -size 689171 +oid sha256:5525fd3de37678c5023729c7837a9c8be708213d691dc648ba9da3eaaa3fd1b9 +size 657434 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 785248669c..05ee20a369 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab3582d1a5045878a7bd0beb6d219297af74884969be168633094f8db29bf79e -size 595232 +oid sha256:2d1057c3388fe79af3e2fc425d600a074ac265a96ad1f2292c0bb130d8b7dcd3 +size 569957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 739840347f..9cb6c95fcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:131f2b7a896e8e6877451cf06dcf7a4dc6355ea6f14ab30e12a460e446612181 -size 642617 +oid sha256:8a1b36e60ddd4e1f86865a96e55e465f85d8af47e371ed26863d0bebd639da9f +size 608559 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d11e2c7871..346aa854fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dabaccc01fef099ddf30711a0f8b697202a3b40decff2ea69cbb77606e92867 -size 616714 +oid sha256:9534766279c5bf1935c9ae407b68d4905fd8cab429a267c3858238b01732b37a +size 586061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9d5bf7ed1b..0a6c6e0fc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a9566d03229d3fc42b64b43e0cdf845e7cc3ad3a09bd4fac32d2bec4ea75cf3 -size 635555 +oid sha256:f928582a5f2dab42b449d40494956b93e51db9e0d8ebe70d2c2d13377f02e80b +size 601795 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eb7acfd38c..b927c19fca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ceefc69192f053890b5e9dee785af1ebc912f6d35a12bbd870235624ed97752 -size 610392 +oid sha256:13b81992396acc9292d836beb8f65a3ac41226e7c34ddc8584d1db9207591aa6 +size 579741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6a4ae2b253..1ac5f1adda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7a61d6c4d9bcf7b99e64e6bade8443a3278973fa37081ce2eb566f76e6c9ed0 -size 627993 +oid sha256:8663143c501e74d358c5cfc91f1c248f6ee435157f4cdf0bbda4ca17f4e8126c +size 604099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f923e36ba9..5be87ec8f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15c1a6c592f6455f6d34930c791192cf87185414f6810e267072bf2ce268ea85 -size 547768 +oid sha256:0462e57a5769f86d8a9a429c606f049a1185cebe245aabaef2c7689e9a2cc546 +size 522149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3fd645b98b..06d38ef1c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:215b5d3091bbef24acbaafab301954898a391e75e5e8a2e78318648fc7c8cc49 -size 600068 +oid sha256:73cc9c22f76715fe69d5a98e0f055affd58c4faddc8dab083ff26360c9204f27 +size 579283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 922b3c7775..cbee9a678c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca4baf01820c282aae16b2bf0815bc1c6788ff71b05e8c70eb7ee7ef5e6abc0f -size 520486 +oid sha256:6f9fc6872086b4bc07975d0814459ed80aa3f490fbb1c34ee2a43fe5477f0d90 +size 496691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4daba201dc..c3a2388a83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab49926de1e346e0ce587063f908fd248a64bc4703481329469fc1d28f1325d4 -size 632897 +oid sha256:d2da1df27910c9314d59ec4359c963f0478c6edf7f9112ea0e9bcd9944b9c1fe +size 601405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1bba405d71..93104c8879 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef1650a0afe4c9be7c370d0db4668375455696cd762db8eb4d2108315a351346 -size 606994 +oid sha256:54a76be742eaa111097b417d89e042273f803281b8e3b2f16af0cbd9702e3993 +size 578907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db66ca9e4e..1bf444a46c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd16815bc0d9f156190d5f155a17897e3e74b21e0f71cfa9c151fd890e7b0694 -size 625837 +oid sha256:bc08cb5770b16f1e7af8a3267267f60dea423305e3e886179ce6e6e8efd3ba38 +size 593851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dba86d6c9b..e3bfd57e56 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a569dc8c23d6b1149d04c581617954d9c42fa7734ac08df1531ce0170659294b -size 600724 +oid sha256:c5e1b10aa98b5680b351acfc4ec176a4b15d394b99813d49fd101d338bc1e184 +size 572587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a0f82b3c6c..63c4992f82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ca96adcc1890ca4d23c105725fcec400d884f76e650febb7628e65ae52da910 -size 618177 +oid sha256:6e10aadb3a5ed78090e2ef3eda89bf0c84500d597167f4d7e8518aa3b27e466e +size 596945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 48cac2d063..54f5ea4caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efd977cdad120b67064ee344b23fd3b08d145d2acdfeaab4658caddcd3fd9598 -size 538888 +oid sha256:9cde6a0f2e388c08e12b854fba0a78b9059b1b93ea1e2e8e9e7f465c4347c0f1 +size 514205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d9b446528d..e65fe08df1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a387aca1cdaa99c3968be17fdd518dfc3bf0145210b9b318fba45d9bcb67710c -size 590448 +oid sha256:1756eb25b5f0f1042d0fc1c49579a858bdd38d6e93bf5a9ca212d37a549f94b9 +size 572227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5dc1ed04b..4b09e4bb01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:e361c71b778145b084f6e3e736582714b0375c824ddf160fbc17da1f95193731 -size 510816 +oid sha256:9fe56f2fbcac377578ef8e764a4fee8a466b72ab0e625253a7568620a3fb7499 +size 489537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9f12c667a7..1a72fda28f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5608500eb32a7cf16cdd085f2106244d4d8b407305321464116ba1d05a1a442c -size 660213 +oid sha256:518dd6001b63795c6be698f8cf0186b43eded371e909cddc67927bf85d793d3b +size 628820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbf0bf7450..d2461accbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a2fdb56e62f3973fbd502e6a6d5a71d73461887cd6d08fbde6fa4edf4964211 -size 634311 +oid sha256:ac662a82f16e2ce7cb10ede9dcf938258d5d8b9ff2a665b6204e52d5cf7cde97 +size 593397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 32dcce9a46..a0df6d6c2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68f2e00a580d037021bd3861c78e2cdecac67145ec270f927af00af931d7028d -size 653153 +oid sha256:9d75eb64fe993e908d291301e606a70a8c65f21d977a0f6e01d9f1b3f7da34f5 +size 621266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index da0962cec2..6fcf972937 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d22138b724e8c1fd20cfc294cbe97de86fb230e4b7c9d4a8b9dd697990caca0 -size 627251 +oid sha256:717187f2f473d877886e1ff7eb573053b0525ef233803394fd514a51dfeb221f +size 597633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5882871a03..d9279cb6b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84b37596080527c171103dbeeeea906a7153a924cba34e66ff797ede548c47fa -size 662759 +oid sha256:ff320f2c824011e491f31213a92b98a3d6af3af0a831e36df9803a7cae40c2cd +size 631514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 57054e13fa..e368fd2b45 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e96b48c8ce10c635b9899f13717daa3629948cbffa4d802c1128c6c6936a64ef -size 558262 +oid sha256:5a890670a3104b187c1665786e48adab7754a0eb5e7aa195d3bee5474f195d73 +size 533923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 34091372d8..ce62669d32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcfb9d6db75500ca834f33447c3e79644b5cd4655c013fdd094f279c294c09c0 -size 628569 +oid sha256:6fa0271b87d9101e09c0224e12066e99cce63777350b388e9841bdd8fece4df9 +size 601171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f86ef65b36..cc905f5c6a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37e235a8a0734c0586955c577701afe174af2a31589707f9e6856e32662dc508 -size 530978 +oid sha256:07b573389cf995780ba74bda31a150eafc9f39073b4ddcaa1a0d9422fd80af47 +size 508465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 93b2d46ca1..8afafcb470 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a453850687de8d651dc37ce03473855b4796f361a808c61f8b2677d764a90b1 -size 651333 +oid sha256:2d1743701e56f9083011de9c2c59775ce6faa0532522135ee55912547211e1a9 +size 621716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2c2c44d35e..ffc1c7d6e9 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae65dd742831c4d30cf75398e11dc346a1ee22ea27afce4dac4d429a4cfa911a -size 624593 +oid sha256:80ce467248cea5f5900f969c70b364ccfef578aab8992ef60bc73096ce334ecf +size 587033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e25b63029..8bd2437f59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65d2844e3c3f67e314ce9e7ad8e99dd6ff8e0da70f4a20651716c94fa8333a42 -size 644273 +oid sha256:8b9791f710ddc5f19828a5413a1503fadadf74860ac841ac18e83a561ce6de30 +size 614113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
176c6c2604..2a9e1dcfe6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3f079c4e50a9625320e10a319e2fb463b0ef114335a349621a9c2cfead11763 -size 617530 +oid sha256:4f6aeff80a5b12f6a199798054fff9db7e38c9385f00c45212a5db9f851a3791 +size 590479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4276a3d7e5..e0a15ef6a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93543c28b554fd1ee883c3fab1bd1d22a3c5f2c954d40cf653c9d835cc87c666 -size 653039 +oid sha256:b203685137fc980aa89e0ccedbc2653e767c9d38922e749e771fb779e91f5f1d +size 623572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bde7bdf426..6785112dfc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52c681f4df6f1fdb8372d035f9bba845192694b792a5caad0d714f0679592d48 -size 549382 +oid sha256:0b6a0020e2c489bde241fbb09482859c29b00f174626bacfdedff70f6202e2e5 +size 526771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d7208acefa..d03c1a69ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9341143e937b80a61b222a5901d7acd03a5a5e1d26c0e6e35297ce8389fffccb -size 619639 +oid sha256:a152067affbe462985efe3341e846dda71edfe0e3954de43c6e231795c53de40 +size 594017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3da122ca85..dbfb4606c9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d933d3e0f3795653a6e9bd605b02d7ee7f27c09a2b1a958a5e68ea2b0cea5bb -size 522098 +oid sha256:e64c3993e30b5e4cb25a815a11f51303874ed7de0a09306ed670b8d8fe049254 +size 501313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 80fc66a3fd..e497364219 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf928bc38b6a8bbbb8253edbe935a65cd955f86686e0486b031b3be2f8f5bf0b -size 858241 +oid sha256:f6a2faca032025dbd1fdc20b90015175209cd62e2548a07f87ab07c08927832e +size 803612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 
fcb158fe4e..498fdc1de1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4b3cb1fe0b137507089f73aace7082d49538fe072d6bb8264b4e611ca1cf409 -size 823657 +oid sha256:28f2e5057c3b59a73c75dd2c32b3efce8d3b0f17da9a968bf91495f2a76f8607 +size 776330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3d81b7adf5..eaa3f0514c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b52e2ab0e057fb8574ac0d6106e5086733c2f2ae1eacc2ca2d0d08d6e7f59a01 -size 742159 +oid sha256:2e09ff035ee90650e2abd68cd30949cd631871d757d4bb18699e0a4fb775dc6f +size 707364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1479316090..54d6642565 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f1d894c283a14816d0bfb73203f2c15c2d1abcc0f8ad73be21e08ab12b7b6f4 -size 809053 +oid sha256:74dac69f92ac67b21595957f1238dda59bc8139b203e92a7f15082d70ab88558 +size 757188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index b5c7adab6d..f2d5b0740e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc31b8e6c86c9b0f0145429df461a6b79be2e297db20e6e3d8e9ab080c9bdc99 -size 776195 +oid sha256:0f809a722ab2058510c68e973fed934849b8acfa56db72b689e2653ad805251d +size 733260 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 768981153d..5b5717b148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3488c96bc3f216f9ed532bbf59d1577a5a7a3e81ed7772acd501387d364b0d5 -size 638321 +oid sha256:33ea25be7555aac73c570df956109038716b850f03de08b52a653bb5f0bdbd5c +size 604215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 060a6cb512..f41bb17854 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5615fb72f1eeddda09f36876481e0eae3331b91a1e7a46b117dd96d8a627ed7e -size 849059 +oid 
sha256:3173e766d4766a08e6048198e83488f33a05adccc02d3515579a38495b998c6e +size 791914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 1505e74e1e..ffc316bf1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31313e4b1f37de6e10fae6c4a3ab821a87ee60148179f37adbf5207a1b52992b -size 815263 +oid sha256:b5b032d3834eb3fc3728bd39c7d142a4b3eb30f8014010d31340c6316d50ed86 +size 766260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a330c80b9..1bf3c3623c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:c8f0625512125b896cb0c7dc90dc3b1c8016c12635391ac66f13c9f63bb5a279 -size 802881 +oid sha256:9be7c923202a590948349fae4995fca73a8fe6b97ec44f06c77112d145d4c5bd +size 752052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 7c04b3fe48..6c15d2ff07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89c22219a9c155010d5936a9c8302285e8f7eefbef8ac58cd40ef3d7b4ca7f8b -size 770023 +oid sha256:3ac79dabcafa222864d68ecf26ebbfe1db2e8fffba24c683ab8703e78e92f342 +size 728122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f188415cda..ecade3ff56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b7429aac8726a3edf6930065b34cfcb6e03da50de3fa277d0a1c505998c99737 -size 857087 +oid sha256:51dafb15d29e459fe4a5b4f94fac4a70e87877875074527b771d9ce5d6336d41 +size 800780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 82d8c86400..f21d242063 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:debe5cb2b1c24874d9d084a89a863a2a9dfa6e8c546a26a7bbfab7ac9070d97f -size 761125 +oid sha256:e70551e0a98c87e72ca7244db00ad88a925b67aaa590209d63cea597d4bf0d9f +size 709210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ff31a6bb0d..68b6e55f75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:cb0893148e2c17d105409afbf2e7336a568176719024dda001eed45075007104 -size 823341 +oid sha256:a30b2290de7c69839aeb7d479ed5b7ffcdfb7ccdb9d225a2f66ff911bcce42b5 +size 779664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a8c9cecc7b..48e07799d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fa1de28d522d9fa119e12d888c9ddee743731a8fa560dd75ad097f87e5f3e9c -size 726589 +oid sha256:7a445833cfc219e72b4f238d2d14e1c702f44372a16162c3fc26581370739d56 +size 682074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 3b30772e1a..07429bdd21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:62b3a78fb12619208db9804b0f401b5fc4e17134433577deaaa63db51c2321b0 -size 688169 +oid sha256:51201c4970d73ae5d2e869b42ff75a271b4038173150dd175e87ff3472744d3f +size 651252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5f57a5e28e..4db7fcd474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:975aaa6a646cd6e79d52086ba591260213a8841b02c24285e7026761d93607a1 -size 600248 +oid sha256:52cdb6fba3337083344a5cc70e5cf53bf6a4694d71a15b85a9ffd8dd955feaa9 +size 568215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 66243f7852..2bee882ba3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a4d6857cf81a082443512e143925fcdb7ef7e88a7156393d2dc68080fec24585 -size 818161 +oid sha256:5f9a7466818068815d9a7e3349d7c2e33209c523e421679ec712934c897585b6 +size 764914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 57bf07784e..5b6137f96a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e3b40249d7d04cd0f1c88778a1da50e3a2c3d168f23171c07617eaea94e06a4 -size 721557 +oid sha256:a724f4d06d9abf178c9ee8c42a9672a9ae46f447442336b1708f55e1f075213a +size 671912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e23f0dbb56..00bdb0158e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:1f98a7acf6aca6caa8df439b770f08baf9aacb6f4c9daade7bfa9169175e9e75 -size 784563 +oid sha256:5423c6dd1bf4bc0d2d4eb593e0068a40ee7ba84fddd47345c55b44c7aea78ab5 +size 745424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 85de95db66..a2b52d8507 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8667b6266d3e9e74e8d775c5c4f2edd75840033b7372d94dc4d4c3a24e25ab41 -size 688749 +oid sha256:8bea52f9deb623be9ac41b0c22fe2bb946a1970ba375b81e0f78066aa2a40a38 +size 648132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3c13cf853b..d2e22b2d5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5baa138c205ad494ccc8494e97a23842af3d0fad69a47819071235b014c8a18d -size 828493 +oid sha256:38264d1fe345e1917e338e82198af825227118c2df833308ba9fcfbeebf4eab3 +size 782942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 96de43e4af..e3b8c16790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28cbfa47e5deb2739e2262b8e3f645711a2999637c278e8a855ad89fc90f5ee9 -size 807673 +oid sha256:902be041e428bccc53902ecc4664f67f58ca174d60d1bb6eb110926d4e5df73a +size 766018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bec05d2a02..1e1d4d3e41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:0b5579881d0f253e006935465f35affda3dbacbaea405dc9a22d817457178d24 -size 712411 +oid sha256:6b79ed64a53e8025c1bc0f957bee5890b932ba613e3fc4ea1351ba6ba5667226 +size 686742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 116da4f491..1e7bc2ef6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1de05659338f0ba3c97adb369a2aa5986c31dcd07738eae0e505e095e68341f3 -size 779305 +oid sha256:db32f9fb71fdfb1039fd29ac36a023812292fee82fca07c1a21096eac0c93d3d +size 736566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f9b79a8ea9..21261d1741 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:395976c7f85434025504f67c93d600b50011a188d7aaf314953bbb0fe6963802 -size 760211 +oid sha256:dbc7c01a0630a439c2b7904c008a1478244cfd22d25208735be43c067e4751e7 +size 722948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9e8f285956..cb14debd9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d8308413d0edc76185aadeae53987d97b94626faf17fe35807a04891a312930 -size 608572 +oid sha256:628d610e48b66a56485cbe19beac1e7ea60e852db83e390b987f8652728ce7c0 +size 583593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7b642d96e3..12e47822b0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eeaab50e4df10dd2afdb358aa0c49425cf68cb27ec807ae0d8d488dfa92c0c81 -size 819311 +oid sha256:dc5d1e0ce351e73ac89f5283c84f7ba6ff8402e5ddd7ddece3175734a97b30c4 +size 771294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 447c79320e..ed4b4ed567 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56201b42c29f5dbd5cb6c8ed83ec44b140643a22869accf29cc6c3c0ebb00651 -size 799231 +oid sha256:770b9d9eb409ba6a94b960b1093f811dcb818bf8d68d1fcb80fd106b304364d6 +size 755160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
1a581db67c..80eaa5ec7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a7a7a9f87e35c80f774bd40889354be4283b21b27dbd42eab0ee3b6abb6febe -size 773133 +oid sha256:088389c88af009f77270eadb4632915e4f4ae79f80f891d465aec966f9157ff8 +size 731380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 9ad97fcbd2..3f88a493ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c12193e1fc8f7eeb488df7618ca0b6d4e8e765edc226b62a1d0f426c16b0673 -size 753249 +oid sha256:12ffcc6e0a37d8f1bcadb4ff8db351b133806642726f5e85265bacdefaf83349 +size 717022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
index 3d297d4ce2..09154a9aca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a087c049798307479f1ab5253090da1fe1aae064d487bf25a21c160a8cf94056 -size 826303 +oid sha256:6ae46d81d27ff64fd603696713e646b09b2d6f8dbb952e96b03d0f45d48128ad +size 780948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b6a4942a6e..ae8108b86d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd21a0c40cf85617bb9a2a97e6b487e24eea474d419d0fa222d290a30d624a1a -size 731427 +oid sha256:7a8d440795aaf06ab5898eb05d79a9622c88b441d258f470471f4d1bb65af2e0 +size 688540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index f87e26ece4..aea504a901 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df979bcbb7ae30e530fc1053517b7cf2091784bcce2cb140cbd95198a88b1856 -size 808689 +oid sha256:12f04f413a6beacaed1f947122e3627d66b234a9524f487900406cf8bea4dab5 +size 769452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index cd4529bd88..8cccf0a481 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a50626c052fd75c55c4e6e9e5ec5edac7fe45121bea1b4279d6fc2b3793e6ffe -size 712185 +oid sha256:ae367f31798790a174cb234d7f904feefbc3625cf6d0c845e0daae742162494d +size 671764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ade2e39df6..b11e5af654 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edfd0ba55f95029bf26e794206e332d6d51d3518ae5089a8af1b97d41507dde7 -size 657287 +oid sha256:770c0e3315d62e391ed8ca6978cfe7b6b6c9a93318eed02e06aec02996cfaaba +size 631370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d41bcb3f23..f7c0798dd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:347b928ee94ca14ce11d29072746ed43d627b6701f7aed845bf0d634e8560899 -size 570500 +oid sha256:0a3f815279731751218680e773a09192fd1a59ca3aeb8d26a1781599f65f82b4 +size 548333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1eabff9f7e..ad4e0a00d6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5741e8a68d3e48440c3ba6e16b6e9d126b9083e2f8d44cfaacb7c58a9c26691 -size 787377 +oid sha256:743bdcddcce414e7bfd0dfacba02be43e3de9c129dd7c9f2ca832994e6f70d55 +size 745032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 94411e7759..e4cdb75b14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c4f50fdf71fb9043a75973f878722e640dadfb74843cf8786ec7433f05b514f -size 691809 +oid sha256:d5ab1ed4ca12ffb677b62017ac7916d1a0593e3a9b097c44ba19269d4ca627dd +size 651290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 781cf6c20f..30f99d9e78 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32e1b306b75b6fd104736ed7d1c881ee03260d1d060f0b5c741732c835041cd4 -size 769911 +oid sha256:57892dcf4e1b709030456ff78e3013a543f832ef56ffcdc55adefe6c518c1154 +size 735212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e6774f3801..967eca78ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd52265b6937e998d1839c245f0f7df96fe43f3530db18a65117b021f525284d -size 673555 +oid sha256:63a1071e8c3c74a67eba5ce35c09102198454afbf63fa117543e2c95b1b4ef7c +size 637820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 04e4b14a42..47030ac598 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58cff8957dc1580d88ecce6d1fef301d6e21da9825396a171b09cb81ce9b400e -size 753703 +oid sha256:1ad1cbacc227bbe4fa9f21d2f02e76fd3946f19fad627168abe11399d0e93019 +size 709288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eca3a93012..285a4cd3fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64c44bd5e1c79cd5036221ba83ae7b01d10af31168001a11b1aa86c645d5118e -size 716209 +oid sha256:97dc20c0e70ef52db1d82e63f9970a8918873324ef92886f0ee8d94b688c8405 +size 669078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17d43370b1..fec8a5e0f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17d63cce920d8092347150333854b5a3887be3c23912c496991c1d78c92469fc -size 743387 +oid sha256:43c18eaeb339df7ea015ef8523f8c39c62829e3e07e945d6a88346774acff561 +size 696208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0786e823df..fe9edb8b41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa5f188ff8866e5896407b821841a709b3d1b2e5d3f320a0c88de0f0295a6b4d -size 710973 +oid sha256:bb4f113983e7fa6217e0ac9b085b9b8acd5a439163a3d7a1c94a02f723650cfa +size 667494 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a98e324b56..fce4d57ed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:779e802e4a8b97fce1e913b9ad4d50e707dae8f90f775a4c61aa043df7722417 -size 753091 +oid sha256:94037a42edcfb308561053b79779b474aa61fe35ac0848d9ded66708f094ed6f +size 707344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5ec67b89a7..5a3bbe316f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:550ceb7b1d8db98be11847c6e03c4ee0ad2fbc475083550e9f14c58e82c9ac8c -size 662311 +oid sha256:5ac6398e3116cbafef77928112dce818824120fe6603c1c90d1056917998e48b +size 613847 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f4b20edce6..d272a824ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0404d09dc397fe872e695c0159ed67a8b07b2ee2067f8c761e6e0c0baded5a79 -size 723637 +oid sha256:8dd765d8da0bf4f33d675bb019590083ebff30750d323b67e59dc9527540c49a +size 678580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 90d2abf4d0..50019c1154 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5416cc1b2b4daa1e7031921c79013fbfc2e0d6ba1fe257ceb25b327bebe3e1d4 -size 636753 +oid sha256:cc7bc583520531b53c800f2ea94860045788ce7febe06df3349cdf2e667b9637 +size 589821 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 63e6fd5111..ec00e04fb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e3782f7c5fd0a685861e4cd4bdcd0d184538bc4a2cc0619dce0c8144060a496 -size 920747 +oid sha256:c177c9c521455b9eb439873825fd464b1f8e961c149b5087263be4d8d7f39c35 +size 857238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 5469d7346c..bde0f2a0da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e80d2fb2acbf1ef9c2b0ef0f3177255cc54ad9ede86f6b546fa2f164196f921 -size 874173 +oid 
sha256:5ce0f29d9bf150bef8f19fa2f21b20d997dd3e77e0c0b5a5fd48b35e83213a21 +size 822802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 37dcd417af..a8fde9ee59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42bfd2672b6df7269c238566cd918d162375fdbb8be7825e77df985237cfe29f -size 789183 +oid sha256:13161ba301375cd26a2892365c29be8dfeb4ae221403142fef28308fc8ca0aed +size 751968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b38c66c4ac..c987cb63ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:3c6de87a341b1b325e5b75e331a33e7df841ba451a1ab9447fbf6a77c60bd5dd -size 804567 +oid sha256:59e63e5f0e58d9f5e3a9f8956020060d191d7e99a218f0ddd8830c5aa010974b +size 765330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a77472ad65..b949aa45af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa07fe841f881e930e05c3f8085d42a6863d4b1312c8510d2f0756e01ad2c373 -size 875703 +oid sha256:cb433904e47a41b020cb5b23ab14461b22fcab5cbe41e64b17d872089377efb3 +size 817178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 411cf8371d..e0355578d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:bc4fb569626909c59deb4eddfe6a0f337a26ac1d33b431a89dc4b7aa88c412fe -size 830857 +oid sha256:6c6c2eea2903e5999f289a1a2d03a5795376de88c17388ca79849dbe544a8200 +size 783728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 99bbc9843f..18af8add2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49954a147b8ddfd0484e8388e879f7d174ecdaee6ea343853e050e2f7e1a318b -size 653771 +oid sha256:97c7f72bad36fef6deecf7bdcdafc11235523ef90ad6ba43a1e1c8894c4756ce +size 615421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 29bf82888c..cbbcf411ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3f7f73e5025d0def1dbe201a9afc5e70da11c4d7cd033c8a5282a813b6385ee -size 687111 +oid sha256:7bd3589ed455ce7970e8d45d850e53bc5f734d89fd897c1e2dc6ef77f0480fc3 +size 647924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cf08ebad01..1278f65043 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb73c7ff2d7b9652ba346b5c52a37e5d352eb0c52d3b7a0a219df08dcfbd98b0 -size 905793 +oid sha256:e77bd3b31d0d57c0a224a270a0afbbd93091e043fc96d43e3f248d410bbaf349 +size 842234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 5e59f64842..62b0bff82e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:910d49ab9c31a998ddbd90f7a1657fbf6a04d5062362e7a08da67345cb0c4a7a -size 860009 +oid sha256:310b41e54929c6602c55437546e16eb7858af3b8c129ba96402bd9b8f091fd5f +size 807058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 356fce1671..3d98d05852 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdd1628e9521728514ea9d0067ec7bd0c305f4fdcc39af29cfcedb8d3cd92828 -size 866817 +oid sha256:4914cdc7301363a6273ef90dd3408fead54fa922857e75eb8bd2a97f7e846f6e +size 807156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 
384e783b57..d8c28960af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41d6bd6c610ab9ac42ce98a3a86125918628ff2d0f8313feda13368a8753a74f -size 821971 +oid sha256:270abf3cf6b9a724dbf08b135c3d0b9ab3b574b6642f0898beccaea7f65bdad0 +size 774200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 72ac669ffb..501c36ea2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b350a4745f03af27a4e12bea8ac22706ed0d088ce26cf9c66c8064cc4fc1891 -size 917125 +oid sha256:c69739743980626e301ff98db23044678d4569c68d904b616c741b4f59871a09 +size 855590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e888eb1f2b..e3808c2d78 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d46a3e6a5b0f686d8dcf322f62834d09bd4404db928caaf975884d3e5efabb -size 819487 +oid sha256:8a71393b897b1687ec532c1f1f45ad88d3a8470549125d4e216b48310299201d +size 758050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index e13e325c04..9fdcee2d1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77826f2847027a91cd9e9728195b2013e127e8aaf9d9948e3a8b73966bbceea0 -size 869763 +oid sha256:cdf1ba12a3a1f9d24c5235a31ca657b8ab695ea7032b39d86fe62fe9c3ba0b1a +size 824854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index bff990ec38..871cfe11d9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c367db855dfc5eddb0c4d45d1867b54bed6a97a9fa4985fa9f687e1ebb383343 -size 772865 +oid sha256:0662e8134cb43001ffbc77cfadab813ac2941409260de46b7095b0466967aabe +size 723070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 007f2b33c2..713aaf86c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:713e670f97ab723c1a48961b8b344a081f1a268ca1188c843306e23a5bcfe0f5 -size 702187 +oid sha256:80362b7f87140b588b2e300e8285f7d2e47708b474ed879a695590871fe20793 +size 667440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 013303d556..74d9920109 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bd161dce06761f3ebeb99f0069d4b1771d8da7934b0440304f70995f5bc054b -size 749441 +oid sha256:4d0cdd471220642e0945e82d45026c40306d2dee997b0df3954e8aeb29929cae +size 708724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 77dd8cbb41..63be37c32a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8475f1503b3eac571d88f2d4a2d15397ea1ae05fa54af535ed6ba7527447e540 -size 615006 +oid sha256:3a9c46bf3325d46fc91c7490fe70f68ab93077d31f604e0bd4a2d67f08fc1e04 +size 576855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7928a4208d..64fe6c6c38 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7b60a5dc904f502ce0013987ccc2906c15f1e96f5e650a8f67c06d7a1ff2ab6 -size 648843 +oid sha256:c0098390ee85ddeae78e2d64b118d45f2f89c70f879689b868b17ee0cbbb75bc +size 612713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 25f999e089..b09b43ebd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65c245bedfeccce1c5efba85a6dcc2ea32bff8cc2c928eb1da74749b9ec26cf1 -size 872377 +oid sha256:1c0e3198b1a3ce0eac607bbfb7638ff873f1f3f1740ad10adddecd7633c1c4d1 +size 812322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4e7abd63c6..a99d4e4c8e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3665e48d9904aa1242504e1bef50cd0aca9947638c6145212a53c17d5e3ecf21 -size 776713 +oid sha256:5244b19393f4445014c860d0e2a50188969cfa287254dcad17956f0dacbd7e0f +size 720258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index ebddab9242..753dd794c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef9a3267cc6b5629ca0c294bb32092ff564e3077efffcacbe37c68341f68003e -size 826791 +oid sha256:a72c0ce14afb3a14d7a2633c1bef66c985cc3a1e954cee2e7a01fb9ec3b9ddcb +size 784102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2d29af53bb..cb6470b026 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5638cf098f53811590f38ef42089308a87f66fda560eab5dfade21b76411ed68 -size 730239 +oid sha256:5d4b907209b897d1e34fc8fa86c5bfaf91023bdc959bab3759f2094dbc359460 +size 687006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 248b017cee..fb51f90e1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a6b982192393dd77d483687665f8fc7963282bc56e16fb2aa99ad7442392349 -size 885325 +oid sha256:8c9b6a8f76b4bcda91d8c589fba11b22f2cd8c80a7ec0901acc09116528a57f6 +size 833410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 
4cc65bec37..85edab234b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6ba1121ff75914523ce3c8b9eea6210eef1d74727e9cc8e372e0a5fb5183ca7 -size 855329 +oid sha256:92275ba7d330aef11995be305819fb6ccf099f5603fee4a9a3522fec79de6dbf +size 809728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index fb16e5eb80..78e986055a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6d4a51200e08982a70a4c4ddf7760d44c2de6facc04eccf74114c3ec952344c -size 758547 +oid sha256:5004d673d42aeee23f6cd93ee7ca85c8afc9b9ae4e1a4b7e398150427d56c167 +size 734406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 43fae8407e..533637475e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e250f8599560663a0a70ec73e0cc02a9a94760c51397f399cf9476c9e4461265 -size 769983 +oid sha256:a8c753cd3aa78efd233ddf628c80cbe32ae78563ad9eab92a4161af9fbeb50b2 +size 742292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d02472f96c..945fc6f67c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:258924f29263fdffc3a051f7b62d406895966e8e742f53fdea31e893b815c983 -size 841121 +oid sha256:d0db5787b0effd9eba85b371bd00d1a9a77cad0815499dc2f8fa69f775ea0284 +size 793350 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1129b8c0e2..378d9df6b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3feb6b1068debf95c653345eee226916e352a806fbed8b2e1080b6fe9984b04 -size 812011 +oid sha256:55e38762f9ac44df748bd04d71c8c4cb7a49c0f93032eb2bab12d95d01664513 +size 771444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index df640d7ad8..4cdbd4bc19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3de7f5a3d2341ad2402fdb91c0a3671bbfbb528c81b2ee357f3be327346dbf0e -size 623085 +oid 
sha256:1bb135f1f2493e3c19031f5195f028450ff2d3f4b702b322cf5cd1320481ce6a +size 596921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d01aece8b0..912ec556b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f305a95d498c394ba953176afa1d7b6ed36ad930d38f923eee23e01eea3e7b4b -size 652529 +oid sha256:2b1855d34f582327bc8c070f183e7a1884573f83a68a097a1f4fc0352fbde398 +size 624936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6c1771c6b..97974ec2d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:687677af0e1b79f1b24e1fb7b996bb3eb4be9a9d183eeaa0274309817524e522 -size 871209 +oid sha256:68865798c6304cba605443749344b22a3a7d7eea5928cad193b45199f79d017a +size 818406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index bad5311da7..b115d2eeb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01ba45f6ca1cd1bb2088e6299c903ad0290d5f93e0a8cffc1ebdc3523066fdd0 -size 841163 +oid sha256:5be2bcefa4f5e710f38224f28ca490981e898d98d06c7ea06a397c1b482c6375 +size 793984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index db64c33832..167f1cd2be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:853c6fef5b658398b2dc0f56a2c55f0ca7d95712e4571d98367605ae52bb9ad2 -size 832235 +oid sha256:34ffcfc46ae157cc61240dc8c2abc4ebf4dd6818398595c689a7f2cb4a89f6ee +size 783328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 70fdf2a308..539ed2e81f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b23b72bbf3aadd0d9d47518bffceee384300861957b8b500a1879373c49ffde -size 803125 +oid sha256:e148981d66e541e85105fe8055d1c8b08ceb07fffa4ff044a99eb58d08b9d597 +size 761126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4747d16886..a1e8ffa7a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:646dbfd039ce135b23acc57572c6bfb8ce8279d3d4eba5edfd1d07cd7ecea7ce -size 880667 +oid sha256:438202535e35c1f6c88f449d05ea42aa58a1a1d3c304faf3c7b283b659c1504c +size 831762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fc0334ff44..277af09112 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4450749ed30cd6891a4b6cb1755297a676ee0018cef044131429556e6a8b5963 -size 784855 +oid sha256:056886b50f86dd521ebc0f5f45775b081b19dd8a4b128d15c490c09dd5c087fd +size 734222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index fd210944ed..ce9085dd2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2326a5f6ff64bb9c892558e6fedcc1cdea2f686bf9ee943424804edf2d5d7d6a -size 853089 +oid sha256:c52cb70560a812b52cbc8875dc86dd0d7b0b53f9efec06db3407bcd7ebe98bea +size 813458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ab67003669..77e6c57af1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b19cc87ae6750a5b869f937c3db4eb94e21cf68620f0c3784bf203f155776690 -size 755647 +oid sha256:ff2c13e1ea48fa49b57e5181d0ba3e06b5123cc15e605bc746d59c8828c3e0cd +size 710786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index ff7641c01b..b805656f8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:8e29cec4ca7863f1ade0715bf3272a6c5ae6f62f8dcdb22f5386175333c70e29 -size 670615 +oid sha256:277d2f37b68aa4090825386525e59fb0b87e1fac7c2b94959b7b8f58b23a737d +size 648348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 865c4673b2..a8d81b8c14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e36377429ead5bb1b810f31c79423dc33c607c13be383114b2fbc0ffe5e2d88e -size 712885 +oid sha256:2434d69e568e619a5db7d8521232d47fa21b036fa891fa76e12bb1f08b07498a +size 685784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 56111eb742..14b8232bb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:29757520edcef850b5858d437fbea38946ac30d0fe39d5f7e4deff36e8fb06cb -size 584322 +oid sha256:a25a2f3c81ca0c22d109fcf25721ad944696a50dced51bb9e87ef13884dfbe50 +size 559145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 571a4b0712..e8fc79b560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17ad0bdb87ddb0d5a8e8fed1d16f8cc9c905c83999bb9e36720f78565427be4d -size 614210 +oid sha256:e69e2b4f5eff4e3551c00a9fc3da7587f1bb1a82578c0cc3cf879ae4f111aff0 +size 588935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e6a9ef0790..ec7cbb85da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:2b1053e6d242262e7b9dd2571ec2d9fb54c0d8e7d6309b01528a5a26d8585d28 -size 835969 +oid sha256:a66f712290e66623b1a78a0f901075b28dd2a3c5cd277c26d4bedbe12c43e8eb +size 789284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e47559ad8..ee5d4deea2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a650f5c97f903c96564b19dca7f3bc28916c80a506b508d33ac810d136eb8c6 -size 741291 +oid sha256:8fa1b79b317e47223e5b03fb93f93bd4ca02895c12950f1695badacd01a95a4e +size 696430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 4d178cb8bc..80824d93a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:5ba050d1ea72a4a542a58812ef562bf3e430d84bc91ce7ef71259bdbe1be24ea -size 810067 +oid sha256:577ebce2e1103477124be47d49a38a9a22b2284908011d5423b81b505c8be358 +size 771818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 55d95e2ad0..af767dcf76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47e9e9326a44566a8013b35a11a48323038df69d28a9d2416021da1ad5ea51d4 -size 713021 +oid sha256:a7d357363ac309434cc91c4ec6fd7752e4fb066a5e96c420f003c22f2b58498a +size 674722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f6f65471e2..9366014c54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:acdb58ca951fa75431b12bec54d679eea6bce46458972c84d0fb0d1508901b80 -size 805651 +oid sha256:a5a820f700a27d6d7815b2b0418325bff9aa4b70abf71b9188bc2fcce9a0f679 +size 750678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 8f291b8545..f88b950d05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3098aa3b6129ab0eaea5a7b683c72549fe08601e0e0d9221b968928961a522c0 -size 828353 +oid sha256:d50b90f0e2b9f72b597bec15e0253ba6b1a757c22461bfe8c13bc2afb9728f7d +size 813882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c9c817f3de..d0aa647edf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40373e75d9593cde6c5c7861284880877f469fadfb3688bc9cd838d3d3cde693 -size 774569 +oid sha256:cc19821766c29cfbb29cdfa4d04c4e592174c60a73c9d2e5ed2417de834d33b9 +size 720040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index cb14b82873..95cf2956df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbd9f1de63951cf63d7ae53ecdd817bdaf36b8f5fe279d032c4838e19d3bda7e -size 691905 +oid sha256:86c5f287c1582915b93bb68bc5a02c9984644e4541435c4c36c418c831ca719c +size 680444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7f26888570..a8400af63b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cd6519abc9e146b08bf0cf08e38df2aa2af1fc389dab6b4045eb39a29b5b4cc -size 792029 +oid sha256:8de0d5bf220b70d2ddcb68f6d0c0f1932158ce8ab15233dac9035c0406d01516 +size 733504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 00bc5f4e6c..370f5ce1d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f04de7da084a4a36eea2f15f7a4d873f789364c446b9ae0a0366d20e13f627a8 -size 766967 +oid sha256:619ba9aff39fd5f6d840c3807e1fbb35747e79fbca78a14f8a900fa684c2813c +size 709872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1870c88335..8d53f13ca9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f766d7be6d1f1641668a6f1880def37aeef3359e37179b6f373e75a6284f000 -size 800699 +oid sha256:24dbd7d5259cc60c0486a2800d0688e9d16f389b0c2c169e7d59f3f88f237403 +size 745824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 49860d2469..c8e8eaaf84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48a9433c08de1b97f702c38833ac9429aa3e447f239d58fea0b91f48647a438b -size 709375 +oid sha256:a206c33acfdcc987b288f2d6c8ef35fed7a5215d8fad1ca5302cc1ecec6096e6 +size 654056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index a555c1c56c..9c53d75de9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94c51392867e2eb49f9f7bbe0624cc349fa1812806d2f135333d7b7f7b7f0b51 -size 770663 +oid sha256:a0b8db27ba273380ce9cf69c33ab8d5728545251c8d8c0ce67a30734703c8a8c +size 756734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 2e87bc55d5..b7e1281e4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8819f4cac9bb8d212de0c7fefafc3e838ac85497a79b6affd3116109a6e6b06c -size 652551 +oid sha256:2b6a7cbd3735e1dbfb4ce53582f13ba58b98dbf691123722147847606940ba0e +size 638770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc07c558e9..20cafd1c79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:472b2bece97581369ef8eccf492019f1bf0f5ff3d82efe6694e578c6f5341525 -size 767939 +oid sha256:89d9584ab3bc3c7ad4e41ff421eff39852b611f151ee585f6472ef2fe7d587f7 +size 716074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 42d1630066..392a9bddc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eda3a8416635fb5c534c8a5e0d401886a6bbe537a70a3a8bc4752d070831873b -size 681993 +oid sha256:729c544ef12cda9509c5eab9a392b7d1beac003904fb2eb165eeb5b69d11623d +size 628104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 737ef3e71e..316fd233aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99e1513f5bca8c6d4c8101f505d55a3c24949a29a5f26dd96a55ff3df79a6bfb -size 1043931 +oid sha256:085ec99f86b41265286956dbe1e20dc68431081889748d229a5eb135d3a8e832 +size 966708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 56c263909b..75ba88141a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05187728acd2b180277e4a6f886b0c540fdd9956a08a9d2ed6a43101c459720e -size 973333 +oid sha256:d25fa42cd52e7498ff90e165000fac47b163ce48484f5ee44bb128d63dac0a87 +size 914808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1f0f417d52..8bc0ff118a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04375dc425a9ee56633ace11c38cc4aa8ae9a8211696bc981f2116ecb103b4a9 -size 929083 +oid sha256:e92e294dd1aa68eb26348ebf22bd8fff6a5f4d7cf8e1d50673140a30ee5fea46 +size 881904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54947d5455..e8bfaf8e2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d4623b234d703a77a44042daf09964769ca7b44d5f279976434c4d36dca31fc -size 988331 +oid sha256:bb3136fe653b59b30f7653cc791c0613a4e6a6fc156934a853865a3a83d2336b +size 916190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 198bf06c86..3a378ef4b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f249dc035c6e83444d85bf83d2304a0d78dc9bd507928599cfa73ce8edd12706 -size 918671 +oid sha256:b02ca8f14bfafe33db300c60bc436451855ea793e5e9ce352a23d196d568d7cb +size 866064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index b09d587464..f163115b25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eaeede96a865981bb2658b73c60840f3db8e85617c2a58ffd31bee5e85a99f8 -size 785383 +oid sha256:e0be79a5123d8a8352596ac9bc9841b2a9dd62c791728d41afb907c01403fb42 +size 735688 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 118976fa2b..6606c2bb7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a5488d80a04b36ed700eea952916a5d1e34654314337872df223f74828e13c4 -size 1021133 +oid sha256:1c565767379bcfe331e9f011add78fdc7784a60069f71d9950dd2c54e5c92c28 +size 943466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d488aa958f..2a63b47d45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d2a7fc79a635b71ee1e3a50722784627f8718265eb304d3f02dca4697e422e7 -size 950535 +oid 
sha256:9295ee786859f865ae0ba21211c6ea69184a773a4f2b365aab72ab98c4fd8293 +size 889988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c906668d38..595e2d8dd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd3665ab467109704fcb0ff0a6617cd87d3b849b6e56fc8745575ecb49100531 -size 973771 +oid sha256:198167e2938eaef22b81b8cf1a041acbaa715da75afbf5ba97ba81a1730149ca +size 902320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 46660b632a..57ff47106c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97a8f8d556034c130b8477c896a05e71ab785ca676125dd969bc3e4c46bb4050 
-size 904901 +oid sha256:50440f1111e9f48126381d3914e8296bc373495283bc093f4ac10d80da732986 +size 851160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6b59950090..1a2e59bacc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4318501c05d87ec2667c946abc510ac69ca33df56609aa5e468818464666e729 -size 1041199 +oid sha256:c3c0fe6907670698af2e038b2dbdd892205f667fa887f36631b9e4a2d68aa51e +size 955884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 661973988a..8d5ef9c7db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d728f59f99846951a99bbcad88f149383df3a7dbbd5485382c8fb8df29284e66 -size 935223 +oid 
sha256:edd5ba1ccbdd3917e29ebc57e600c22cec26830f8a3c1a5b4bbc5058b509e082 +size 860170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index f9b1d90585..ab16012407 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c66f372ffae3489a57dab735e748a5569c888210e21e451100f171ffb3485d1 -size 970601 +oid sha256:9e57a4ec12c39930ef547f44ee6d5b2b0f16277c3edde883c486fba8f05c145a +size 908474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8b79137035..3652d4003c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68f0d8930924d5467362195d92a52d2aa0363ec471136b987e7bf859aa16761d -size 865413 +oid 
sha256:a129c094e5e7c1763a955d94bb8412e4732086f8e702b443fb2430fa1ef229a3 +size 807924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 33355b04d6..e9c8bbffa8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59aefe753d6987b19373b97f09349f9d959abb0883ba9ebedde95c22da0bf5dc -size 842731 +oid sha256:f758213fa6ece307fae54bf173d4626de108937ddce3fa5d871c77328e389119 +size 791112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bf1a4c0c29..a820ec9c39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c559e7fd1352fb7dd3f7d5f4d8f76451cea8f9fecd0de35a8a0d90bf641c25c -size 746967 +oid 
sha256:0b051c7f93c692e2971c2f96d2be9a72cfd43c2a2dbf88aa03f5b4c1623da018 +size 700478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 460d5f2089..3648adba87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ef5ba4eb9f75545a5a9b0c7c63d152e7c97b5bbdc897a72fd9bc4e69f6fb87 -size 981009 +oid sha256:8ecd4b3cbd4fa4e5445aa76a4b11df55d6b182109f8d9784af155f1986f35c41 +size 904428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7b5a05a1a3..cd98fe52c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:976068282c6641f4d2f2b5d10b3fbc293438c0c4597a9cbe2523418881fcc2ad -size 885345 +oid 
sha256:605fcdc6789342db0e4212f13bdad1d02190fbe0a196f48524c39bb9f88c54fd +size 816064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index ce581db3c3..0545472117 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8009788148e17a29d674a17ea264a3d50bdcec2d73bb450fbc5b85cb24fbb8d -size 912139 +oid sha256:e0b710ef021e4e17c8eb35944c870eb9f96b7fffca1abd2ddcbd112126c0409b +size 858744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 64bff293b1..3b638fa1c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3e6dd45192db6eb0bdbbd05ec82a8031466925cdd962ecc0f16df223cb52493 -size 815683 +oid 
sha256:17270b7db4ea3b80de93578491488dbe5218c33008b796e462685c97ee3abc8d +size 765594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4d5758a8be..ad07337d7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7377f091458799e7efba7f09496862a161128c232ee91d37f74cd2ea8e242883 -size 998003 +oid sha256:7a1adf8b341aeb30115dbadc5d99fd3fd5cb638c9694c1e9c1770dbdf2274b8a +size 936516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 338f166631..2bf9b4d415 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:ed3dbb09f6aeaeb1f92e64786dc9eb6624abac18e295c9fef894a17612de5e11 -size 948815 +oid sha256:9c68398137d2aae92ac234be77cca71662e8d1abc11bb8ccce6fe91700b2c5a3 +size 899366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index bec51f30e6..c00a4d47fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a78a348591c939e2484890c7e850865a37ba03acd733995e9a1b52576971b90 -size 883993 +oid sha256:902a8f63b7d1944cf7639a6a4b96c24c041c1bc5f534e7197d80580fe785decc +size 851712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1c8eecd92b..489d6409a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:9a96294fd86da696f77ddde526f865baa27b6e981a45b188dbceb2d36a8d9df9 -size 942401 +oid sha256:b653a7ae6c367b2366e0f921d35dcb8900b10f219806748719ab09e0b8691956 +size 885208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 78f3232d54..ee2fc97a22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27a20ca997786f2609dc74d1ddad17d3a527292bbe1e1b42ba5f4fe0fa05f080 -size 894941 +oid sha256:7c552f7ce8406267954de1321547199ca50c04f9dfe78c7430d051e0152d598d +size 849784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 222c3d5b92..1b2b2322ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aa6e80ecec5e76627175fd44a94cf94b6f9e9829f9e847a288f11f88fdd2e88 -size 740243 +oid sha256:3f50bab40a379800d1d47828351f98c9bd1ec6824f9a5a1fb6f3830734a18e49 +size 705496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 36d2e7b812..f6f4f58fee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6079ba1bfbbd93263caa109d1f3f8e81c6dc192d44017a8872b62d66c0c4f6e7 -size 976043 +oid sha256:6e8b8baf4ba7e7f48ea0df533587008111a73460fed58060fafcd34909c3c442 +size 913274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a771690061..6403ef44d5 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:611e3b9efd7b53893a61dff59720c105bf1dbf7be6f2677643a8cceeb3a707de -size 927595 +oid sha256:8a946507ce6f72192d4b17b7325b93603070d418bdd7ebd9fbdbd316873ab6e9 +size 874546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5e60171f4a..351b782e56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2d52fd2957ecab74ab22ba376577c5bfefd5996706366be568f34f2b91345ae -size 928631 +oid sha256:db4e82791793dde04f7c846d5f34084090be7a70aa2df67720ebf531e38e5491 +size 871388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 
7d58212a40..12e050493a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73bc9d37b24c61c54b4d198baaff3fdbbc26484dfd7a84a40cbf87e84a276770 -size 880381 +oid sha256:93be0dee778889fdf8bb5c7cfaee6320dca3e5e406769645be3bce1c529b08e7 +size 834930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7bfe3e0882..e20c4536ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f2e5bbd1717c0fdf6cc605f2960e73d9af513ab6f1faf8fcfdf025fb03cd0b2 -size 995023 +oid sha256:90bc071822d7e2625d46063486ed8c7cdbb5cb6a51c08768a23a05cddf86929f +size 926532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 40f3fecea7..1c5f998d87 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdde6ecc01feb64640376de87bc60a2034da78fd8e9b3e5888e58e46b043b6de -size 890871 +oid sha256:643aaec69b636b08ce923ce61e5ed8a2c81a196c247214e2b92b1b686bd14175 +size 829978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index b5f82e6111..50fb8164f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1661f5ba3ca8d2c5c62541f470e89690f4ac1821d82f87ef848dbf5271e290ed -size 948253 +oid sha256:6bd6ad268397924a508e4b2539151a00b23cc71fe9d365df8e75871c9142114f +size 894512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d93ef777b1..644800d9cb 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa399451471924026349ce0b417ead82d04c77f7a7dd4ce1eaaab8f950ccabe6 -size 842523 +oid sha256:631d58d6e137ae2869978dc2fa5328da1ff6275003575b0162ba20f081b37086 +size 792482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5d0cb1d223..442ba1ffc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2093ae1e418f24a29de397a16d04d3be4288e7f36855e84f244f2b0d52c8a53c -size 795765 +oid sha256:72bc66918a56e5e1ef78d672d179c11f5305a242f81a18b7f27dc399c957ccbb +size 761856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3a0e967de0..0e9a76f71f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f77355238eb1c6d1ccb35f2171b4364d653a3dd14451f3837d9888f60bc56ad9 -size 702665 +oid sha256:a49c5f958f900e50a7982457d38c13194f36f331cc9a4355de151574926c2461 +size 670336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 84e9661312..9f57b40f42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2388c42587d05e4fd546eea6714b2e72b7ab0afbaeb85297fe38459beb187c33 -size 934833 +oid sha256:cfff1f93cb4fce13cb1115c27c95837a8bf2a60d1122733498d524653603e127 +size 875026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 839de50c3b..bbd5c4342e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0aecc2bddfa397dbb566a3fd9c2be18f295a20b8082cf406d434686b89f00dc -size 840253 +oid sha256:1e0fadd1095a5922efb60bccfd588d6e88c9377dfc633310c3f88109a1883ca5 +size 785922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 9bcb28ad53..e7a0294533 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3b6f3a4c1afbe09f18a589d6acc5569351f84026f39ea5a65f8773e78ce4ac7 -size 889791 +oid sha256:c0162c408baaef68c2653030d4efa728b67864417bece5e14a00f59ee823f40a +size 844782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2a17596557..4cce11ce50 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f6773dd38f44e57cab2c2c260162a887355d52df318292fae86d3acb7e81a9f -size 792793 +oid sha256:da4d44ffa1c613a3c9ebe9e80e40b54087f7346964851594e1840845744d0f08 +size 750152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 914af9ad52..5e03fe67a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bac33fc573bc3614e8349b4c01821e80740b11574c3defc3967dfd60b8013b2 -size 903875 +oid sha256:22f72abe8430a2042d62a78e9d7f7d1312d4241383efe0bda817510836e2c54f +size 829464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 
8eb90bdaea..4fe41c7ffe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41b6124bcf453a9331aa49166c1c048bb47e544e74065522040303a628c104d4 -size 864751 +oid sha256:5c9a4d358c0683db430743c08b61b1989ebef5ecc7f1d657374dfc4bf19295d6 +size 792462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a032609c26..6b2a2fcc0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ed7e75d5d098c2e444300231e34f3a261f7e45d8921d6782060522e891bc9f4 -size 880829 +oid sha256:fa6224d1cab72cf6da6492716fc04fb474d01158e49f7566bd2bfdea2d6090e7 +size 805530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 977aa8616e..a5d1a1409d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0693f0e879b550e3ae063ce16567b3a895c9ad6364ef7581f04804c8e6e5d2fb -size 852559 +oid sha256:fb753698a2a1def4a3ad11e0d4ba85f2619375b2d526400cb037daf57613991c +size 777952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 41aa207ee7..0661cbbab6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72b870e8b46300a295a1b31c75880a2d69eb1b19fffb883759bc41ed12610c38 -size 880963 +oid sha256:fd38d9265ad7bc93b57606b711d1b546b2a924035cfd29010e58bc5361f87282 +size 821008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index afc72f9e67..9cc23da299 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:084320cc522e4f6bc704deee23fbdee10122369ae036830ccef4bf4aa321af8a -size 803699 +oid sha256:71855fa3c794b817c28f5155fec49aa17ee0512f33f2913bd5436cd6d2ac77b5 +size 726280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7ffc59de75..6ffc6d4f8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ada3f7321be01053d3573eb5660e2eec21b0c0bb3d874b26a890974ba21ce7b0 -size 845491 +oid sha256:e372d2f136611b05258dff9a0ef5a4523bca4902d83b7e948b78debaa4270852 +size 788446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 64a426fbf9..90317ebcc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df85ea5314d837f281008df1a209716e9071d26eec887f691f34e2cd6ac0846e -size 772519 +oid sha256:c27265c2d015c0fefbc18ef11d851d1374516b0dbb34de3bda71a980c0337dc5 +size 697368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4d9433de9..e753c8d35e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebf7c02bd027dfd90a8195a6c93f14f87294ff1a9093c12f29a6e7c10e8fe733 -size 671717 +oid sha256:4afd3c610f5752efcad79c13a09b35930c5f7d371c1d0adff972925c653ac4f9 +size 636872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0dff276ac5..db0e06994c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7f60a61f2c8b083a432ee0d3c38d733c28d57ee9c23f5784b02b077f83be4e3 -size 624109 +oid sha256:001876f038ab01f25af3dc7317505dd11542432ec2fe95ec55b033f6a9e1b21c +size 593899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 165f5e6572..5bba10e259 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d12fffee090eed5f7780f8eba9c77e8c6f745d84c27eaa98c5a8e93bf159d2 -size 663177 +oid sha256:4bd21880ea55752524b41f79895d264328ce619321a1255cccc2ca225ba4994b +size 626950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9c96343000..f22b5e9fff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f2c6e967a56e6357d056c85452c128f0683d537210115a583dbd805ce904bf4 -size 618627 +oid sha256:7c4037248963afe35e086c4ae90706f384ab844f527691459b760183fcfa859b +size 588565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f1a1c68fd1..d0a9780dd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a32ec042bb6cd4a3aaf7a2d7ef1d1fe783f7ace07cac5c15cc5e3cabf121422 -size 676679 +oid sha256:32d189d2cc5862ca2496ecece66d56bf79cc3b4c5deb4a9bbced4500c4fbec28 +size 642376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8eaf647867..8a2b9aca81 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8984e7f633551c10f846e53723ec4e979361caf4fd4dd4784281ba11e11e99b -size 573070 +oid sha256:a03f1f3729949929edb719f6509dcb00f71dc232c4a9a41d2f6244ccba354663 +size 544539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8202ccbf6c..7301fd1787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edb3917fd49408f018c53dc005853a157b709355f2dba9c4caf406b2b33292e1 -size 610816 +oid sha256:967a1c6387793b8a874e3d1da8fd0956910017324a3c767981d21b185bdc163a +size 587811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a809852f29..6009f1567f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:165c8df2dd29513c8ff45242e31ceb0f23fa67320d3a0b6899a9c4c460846295 -size 530296 +oid sha256:021193db5c30368d75e0012e11d0af513f90da6aba7e87a933124e1c56585c4d +size 504725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77f93e4d80..deb2907c54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d087f1cfb06602fc855fa57ebc3086f3a2b5124f40e423acfff1b1c97a990210 -size 662047 +oid sha256:85b081ad0088273f254b6a380680c4d073bd964c76f20ad9dfcf2bd9442b4e00 +size 629718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f6f9d68d81..2e01d396e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60497bbb70ec3fbd3b941f2884564153ec912bdeed11d5409fd07314eacf5ca1 -size 615228 +oid sha256:15031402fff8e3a3ca0eee2e6931249f92b10088fdfdf5cc6008b231ca2cdcf4 +size 586745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9bfa298b08..833747a7af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51a19e37508d4ad1a04685da3ab28266dd4296ff4a1836eacb16daf49e011225 -size 653507 +oid sha256:5a035cb7a07673ce8e69b9aa02e00223499ea037df7162f6d9c122b4486e2cfb +size 619796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3f07fe5d5c..9c07dedfa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a0ddbccaf4281d766489d3a6752b4fc4ef658495ab57da5ad18a9550dee1d40 -size 608956 +oid sha256:b482ea97ef0d23dc96c1106b9983891e6c2de16c072badc98d7efa0ec9bfc719 +size 582003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8feb0a8aaa..a1dc044b2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b241e07752e2e933de16d9b36ee6a751708ef2b536a566ac1dbca9b531db1e66 -size 666961 +oid sha256:fa6bcd98e3728fc59b6b03868b2712c6c25845d98c6cbcfdc835d4082b6dace4 +size 635224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d3bc984b5c..0aa42c99e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:c020bb8db2ee0dba6081a17221104fde06a079d14592d5e60ac9aeb9dff3c869 -size 563402 +oid sha256:73ca612400113afb665d5eb480e99793ecf0ad2702017f7678fa7e7d76d1572c +size 537387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 99638e8b6b..12fdd450a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dda5f68a7a81ede47daa9e8952f3136e5d4fe8264f7a9caff5024603d2c8e0f6 -size 601888 +oid sha256:c8f2c7b29caf472dfa9214923cbbb44e894a9401db28eeb3fc9d27a3a6eb4798 +size 580559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9bd385ceee..c03bc6c835 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7f8733e716cfb317b6846df96b54c34419512ba82e8125530e13591628b6e82 -size 520578 +oid sha256:d95147a79f05bfed37608e94fb7c1a8eb330260c73e59b0c3f1a3ec0b0db6032 +size 497573 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 699f662c05..5f61779b6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1553f15797916708caab1ad59c987c0645c7c747e292a04f4d7ef7be730da679 -size 690941 +oid sha256:91e29286057cdaf84e00b836016730aec4ad7f58b47858258b5707d6979f7928 +size 655554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a4231b8124..fa0b6a148f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:374b47151103c7c7cc7f58ca58f816522377dc42ba427d8e938c679df50f837f -size 641705 +oid sha256:161255db2350838a5ae4b535e8579cc1755d7cb79a65ddc2b0f27d4960987b02 +size 614159 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5316864dfd..e869e1f7c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a22a2604ceadadf5fceb62d29d670791b4c9dd2cf0623d3bf276a274c39e170f -size 681563 +oid sha256:0e0e59d453c8b435db4076b06d104bceac33a24be3dbdce2704b1fdad245f8cb +size 645680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 36509a4c3d..3d94fb4602 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e7492506d8923fa1aac9b672deebf69bb698b7c11882c13075fbbfbc6c11f65 -size 636273 +oid sha256:c94690f3c7d9eb92d1e6bff1c2c160eadf97df2430988f4a776d1ffc21504ebe +size 607247 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 294e49006d..fe041bf064 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fb7cd2b4b1ad1dec57e72da35719edd2d915ccdd3a9b3baa9a8f34dbbc62b93 -size 694917 +oid sha256:c0806b5e25c3db03032fc122f088ebaece65f49a94c2a2dab95d0ee3f755a3d9 +size 660368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 816ad9c093..8074649606 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:214097985f0def629a7c57cc850279e98e7258328237f507dd08e19578cf8fef -size 583564 +oid sha256:78e4677d938c20b735482fa4d98f58cb7609f124a068c47162250993c82dacc2 +size 556217 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e258c4042b..5ce8203b84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9833362a9c22a5fd3072d1e6c335b4ff9a1259bffc972cc0a53297ea1bf36638 -size 644843 +oid sha256:419aef25443cec47c1acc7fa7604e096f681c5058af9f0bc7d44addb9cb05b1e +size 613499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 931e251b9d..6f7c5d0920 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1045a9fdd7dca5e37948000e9af2b394f1d9536524b9613150d9327d6d2bc5aa -size 540740 +oid sha256:9743f121b355fe0661a1971f702226386339b756c5af2ae1ebcccae9b3f8028f +size 516353 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e8a096e13..9aab080344 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:102a626aabe400babb192a68781f19fe2bed742340643a1a0141f5eb551b0337 -size 681223 +oid sha256:6df632422bb9edebec1d9c7d14e4971abe032dc20d4b46691aed30d5b33d94b7 +size 648400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5de6f7ed44..a131071079 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da129c568a6bd212f4518002bf191cff794dae0dc9fd6867be1237616c964105 -size 632035 +oid sha256:96379721cc8ce6afbfbf3248fe056604bd3e74684cf8d981e7b036eb16cacfa8 +size 607055 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e243b2683e..03cc528391 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bffb6bd9ee861ef90c2340008866c06e8b6df3e40799e47a8ce7fc7796da8150 -size 671893 +oid sha256:b2590c15529b91bbb9874004b1aa513fd53cf7b9f5347351ba450dc6c07d25ab +size 638528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 39d24bfb38..d8ec111dc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f876cc94661af72a1407f12cca970c55f0bf184dffd802a35078606d58e08c31 -size 626553 +oid sha256:d3aabdd8349c330fdb4fd2740f7af249f8d037f4a3570a5b47869675c328ed94 +size 600093 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d1916380d6..ec9a2da0a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36891ea31c6d8a6206ee4d2c54958eaf31045e1866a3ec2f77db0982cec808b1 -size 685149 +oid sha256:1ddc045ff56bbbc012bf07805db9e8d6095fbb3bbc9cfa3ff6a8a89ddb02bf5e +size 653214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a9b0e152e3..3f9b5bf993 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f863f795c0b8c2197c9931acfa4cd15b757f1e704e425419f2134b3ad3290a9 -size 573844 +oid sha256:bd42f33b36245a7180b9523abac41e2ccc9341a7c23637abe6e2ea49359c459a +size 549063 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 135fefac96..02afe84d3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:115b6688f0ed7bc78b589a8cb47e990ae55a175b06f391a03cef677068a799f7 -size 635123 +oid sha256:a80fcec34bdc48329b8a739f544e076c02df08ea52636c4df6d67e88019662eb +size 606345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 692c91be8c..6e3fd4c9f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71aa77dc50bee0e6ca6234f4c4ad4af33990bff6facf314bb56fc4937c719bdf -size 531070 +oid sha256:dbc00f61505b54e70b56a0aad34ccec554ddffe8aac2fbec962ec707ca47b99e +size 509249 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6fefad07dc..327184a9e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50edd8d003d0e4a05678b4722d94e75b2969b7db44a19cb1b6d13c5a127e610c -size 786911 +oid sha256:ea7c856f013eb1b723faf0528c096b4114321ec847c5b5661a6eec29d00cbcf9 +size 736180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8b9d11f4f7..90ed3a198d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b6e8b287e79e891b2f56ff1779ac67a78415ad41248a7825723af6eec53d0c9 -size 745173 +oid sha256:4a05d53ff193aa9ea3ea5d34552b19482ce649b6aa09be2ad861773242ca0a23 +size 696662 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bf93e1b1fa..c3d79a54a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b06003227372167a5c64dc8836bc36e024373fa79ac54b71508d29f42cd6f800 -size 773437 +oid sha256:a27ae5edbbf8964c4c37f8f9e22424e2b3ab139ee1f37d28a22f5c559851b631 +size 719006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7b58bc039..dba74fbf96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ea40070cb65044284ca7c609837ecfd6ece606ac6ad870f893cf7d6300c6a4c -size 737027 +oid sha256:8f472d5efca1d7fcb872dbeb2c3c742de85559ba0d8cdfbb27a21993f6ca5b3d +size 686788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 42f55a8a11..98ae95e606 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc0d99cc82eed7b40895a44e795e2f7153d82eab1b4aab20357cec9146f3a5a1 -size 806179 +oid sha256:b282cea5b53364a5440a181cbe26eed58db361545d5c67658a3996ba9be8a952 +size 752242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e733f5301d..6a123efb2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60b8c8b3e99011557bbf8d565f36488c80f931d1eefa27eff5ed7c230fc6c7ed -size 694087 +oid sha256:be0e10600c743f01df8d962de1b6b30c226654a60313278aa7e6eb2ad0e78ad1 +size 653764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1efc8fdcfa..31acc259bf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:154099dc9ecea5a2c3cd7a1d6c104e9a80c1a5e49895e9146e81514380284d1d -size 724729 +oid sha256:f3535eea9b456aefe70c6d100367f3383b0c0dfbd759b5034a66eb3c1fb36198 +size 688402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ede0087496..977a8286bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a25f4191283b365235a1fab085d75d32fbda1bb488d36f2d64858fc36bcfb4 -size 643173 +oid sha256:4bb43f9a5a5e382ea427e4b6f4ed3fe7ca66747df93909b7ce993ab238ffdecc +size 606105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c33dbd284e..964eae7fd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60cc8e1c83bc0f20826824aefb0c1611624941e9d0307baddbbd2a58c605b9b7 -size 767571 +oid sha256:6f412ae950a2e94b0a18224fff3efc596ec2364044046e93d15ff297aa24bc82 +size 721872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 98456e6aee..db112eb15c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61027a0ea6d91076845fe9503bd97cd0489b4db9ecc0e3fc8c1358c1c0386226 -size 725785 +oid sha256:37b34516ba14ec1cbf67ca7fc03262a02339ca1d7531fdfda535b54d2608fc25 +size 682404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 906a7b441b..7b158e8643 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eb4e2ee0e84bbe1e069ccfb0bff3f93ec115221480e4f14594933822cb65f20 -size 754097 +oid sha256:f84ae23f00f2c23fdd9bb0da46c01856f815b07d38b9c8e07033151bb5726267 +size 705488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 610e3654bf..a9583a7a33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:961d9d2e5163cb7f62e02c497a4e5d7ecc11551f40989217480b2b6e4492f28f -size 717639 +oid sha256:3ed7371e139739ce214354453e893c55d3e64239aa442b3eec920fc8c657854e +size 673320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 23aab3e2f2..af0196fbdf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:403b2dee1fb99380ef61d56e4b207a532484418d7fa12d8dfa1b574177034472 -size 785953 +oid sha256:b16375ff2952321f8ef8be4f45f264fefa954cbb549c49a583c9027287a38653 +size 738724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ba3902aee4..a4ea435540 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f001ab93ee1fdff815d8d3066020b467c62d692b33be4598510daf7bdc9bb9f9 -size 675537 +oid sha256:515ebf97a8962eab084a7f213fcf8277daed5a79881303470526372699a13977 +size 640248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 866320b549..59519dc77e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0acf6009e96c23cca6d962bc76dfde8791e7e3288f54f1aafa535754adef8cb5 -size 704403 +oid sha256:f5235acb337c7a37c90951e3916402ae05c2b8569d4ec8901af283009e247c13 +size 675034 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 086ef06b39..7232baab9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:256a76023a19d0fa5deacc8263760aa998078a4672bcb418d0fd014ae57bd56c -size 623833 +oid sha256:529e4adca55d454510421974b860d4fe1c5fa319b708196f68703167c2d66a48 +size 592589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 97621ac6c9..b59de490c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:229c2a01874b8db611031cdd4d394790ac19a5e7f76ed3a25c8c734fbe375c5e -size 806925 +oid sha256:92032c92d9e8add2f701b6a98212bf5a3e60e72756a89b0ef414c6fb98cefd1e +size 755058 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a1ea6fceb..90c900c49d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b21f98a53066a5837ad200863ad1188552615c10923139ae79f4916b3a7e375 -size 763559 +oid sha256:e1045f42fc029e154d1e07e9178108b1b76e9c208249d65349ae6a659bfa33ee +size 716232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eedd43ca41..5b78a85e79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7953ab5af189efd55cff779ed65829316ca0c4606d501ad75eeef43677db5a65 -size 791871 +oid sha256:d0039645f076ba9c8c3735d7df96ef1e68bc9cbce6216e4f56ce43431aa4a1d7 +size 739464 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 880f45e276..0a1ef0adcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ec3dcfd7b16720256dceccd858b0c0a87a6ad276193098085db760df155af5e -size 754623 +oid sha256:d295f5b0aca6f4e76fac39f4b8a36d3a5e10086dcf9095b47d0422f7dc463a43 +size 705570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8bc825db3c..957b6467d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d36fc0ecfccc538a021d43158cd11e21bee50ad69b8a03491409f062615dbb03 -size 823629 +oid sha256:53740ca7828d01b168eee25573edfe92e387e3166c8995c074e380150270b86e +size 771022 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 421193902b..df2dc3c60a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a02c458201a8eeca746ac409d12ab43b2067e7f6d795af6acac3fc3bae88483f -size 702359 +oid sha256:c79b79565cfb842370abfe9e90370560b8658e4b8c24e5a58dc92f6a46572f6c +size 662038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cd73d75d39..fdc5a9b43b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e8f29f141719a0079f52ad7a1ba61b41cddfecb72bd0aeb4bd7739dc33877a8 -size 765265 +oid sha256:336654a205895ad7259f6b375d6ef26d69a4f85dfc51046ebe537d9b132b2957 +size 714880 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a8b00a9eae..0d22ba024a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5864c3ac7dd1856c7275b18be12db10890748ed6babcdfc2e7ff4e0fa0017262 -size 651445 +oid sha256:6dea541d6a2dc8531ac34b08383f0551e6997a12b276a04c31fdeb040ec61611 +size 613539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4757b71b6b..508a208530 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fae445db527aebf301bc4318b923b266f255eebdd683da9ea9b86d04458017a -size 786747 +oid sha256:fe0c18af3091998f3d86afa9aed88b522578fc235bd57b39739cc92808207b6b +size 741542 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ff7d83fb1c..dfa9928218 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b1fa2dfdeb872035ea7e61ddd7a9a632c5818382cd7dcfe8b644927a01f522b -size 743381 +oid sha256:eb9145808d5c3529597df6df2faf76af887f729918ded452ead34bfe6e8d7fa0 +size 701924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c13a682e5e..4bb879772b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9de65236ff300eccb29fdaebef12c0fa7c528af9f49e8df61c0ba8348c263324 -size 772483 +oid sha256:fa12d5acdb3a35013200a082021833849fbc731a8ee3b4ecc0157cc114c25d0a +size 725946 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 75cf61d580..0fa85ad440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74bb2b5c8efc048f2a59434c7d5a15cb73fbac4c41ba46f805ac2682ec741f55 -size 735235 +oid sha256:23bc85d22b792d3a5137aee50cfd900142464e35d2ee758cd1845fca69a92316 +size 692052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 88d720e93c..83392afe81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:731286bae90637ddbbee12bdc86d51f8710a4482e8de900640bc16adc3339c97 -size 804141 +oid sha256:5ff66c209d3796af1d3bc7693a59b0267df50d9e540acb73f57e19420eca6646 +size 757506 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d8b2c3be77..75469c59b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:687a5577ce032a48415a0688c9b8cec402e6e1ea93d2df59d4f6afcca87f7655 -size 683809 +oid sha256:70f1ce01a223a367cbb5980534e69f97585fb909cd241d4bdf27e4bddbcbf372 +size 648520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fc41830003..33c9f99d85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b13c56c8af7465df13f26ec404274f65003a2222ad2207385dda7f5940b0fbba -size 744989 +oid sha256:c1a9dad9f94b5e7dca02a96ddd9d5dd6b92e3311a64b103431663a1bae50ac59 +size 701412 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6f68e7e16..7073edb746 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79951319ea6d9f938297c1d9b331a98eb3f3e4f19f66bdd7514740c899c5059e -size 632107 +oid sha256:bff5feaef70bea800af93ac6f0324a3c9f4f21b30896a080bda3b630dbda84b1 +size 600071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d0d3f4b6ef..0dba25a4d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e721f8f640c7de756d2018ea9030e892a54158f6b76996bd7908d9b0dc1a535d -size 649071 +oid sha256:029cbc881fedc44be3a3d1b3568f863d7634ddd3fa906c7810acabb3f379f150 +size 616395 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cb2bf752b6..a5a10bb768 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af53530dc87895bf81a380972fb496494a2f23e0baa02faf9fa31e22457c3ea5 -size 623959 +oid sha256:f1067fe694ca5dfd28b67c26ef008f2b3bf2d3266924e880efe810b34c9258c6 +size 593897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f12f95f854..8b44fe3a88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f29eed027854e38c25d30d677522428de232223b7c38e748ede00fe0320ef2a4 -size 642799 +oid sha256:0b25f95fb8c47cd2ed908bccc81799d35720309d211be104fc49a4114b901dfb +size 609039 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f1566f91a2..211cf3579b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8da4dd1e80520748c294831857bd997fecf12fef8f5d033a2f5ea3a5d8f19024 -size 617686 +oid sha256:2677593611d0104606ee55f439789c3f14d695df403d8b3de12f8f3d636f42e3 +size 587773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d6f8e9a353..796a9fe474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce87fa92200f303a2bf74df73274a9e7370ed2dadd1648e8bb9c0c47535cb7d4 -size 655169 +oid sha256:72090a81b276d09d3640475a09f5857c4abb3df20749077553e6fef802adbe0a +size 621902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9150c579d7..a1572e4ba6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:130dacdb40e2d0632715083386a7a6a628d1baac29ab4361677c12d66ffeecfa -size 557874 +oid sha256:b7c2c0a552ed9d11bdbf44eb0f7416e508e4988e8d98cbe7179185a09642c16d +size 530477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1e2f169395..fb8a30503b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5972d633bdd07052f65d1568c400169ae1f2a088b2597f07eb088e4b2a3c8811 -size 608200 +oid sha256:ea925c30a55aaca8d0c777616c7da98663065ea6f66ccd8a96ea81e8efae01f8 +size 587167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 78ff9f5d17..384ec7af9a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3da8ef7209e75a3c922f68855f0ad2d7a73727522da8e364711b7e7cd4629a5a -size 528470 +oid sha256:75055dd124dbb7ae7200ce4bb116624cd22245fe4345f2fe9ecc3e028092ff6d +size 504823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec0d360b97..fb89716172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3383225031a38d834300f0380599dc9868863acda8f2a7831316d926c2254af -size 639401 +oid sha256:2d910194d35844ed271299c659645826717d4ee4921237cf4026f5bdae885520 +size 609241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 371e479c70..d42ab32b4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94fc8e9edee296e70ea29873d572af272c7f62105cbeb975924e719566937ec3 -size 614238 +oid sha256:7138a3c4114bdcde054a328812054bda5aebeeac88188978cf5d4b0305a6c012 +size 586743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 246cb7c926..50a7ad1242 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a49d772bee055b3365c118b3318a9463092ef40eee1541380306c48b72406d6f -size 633131 +oid sha256:b836692afaf2c3757c9b269b08743c5a1e345da888bce3faa0ede6388e79335e +size 601885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f41b1fe41b..ec59ec79df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49405711a2c16558a77c8f14737c27b00e28f0a86f28afb85ca7763823579fb5 -size 607968 +oid sha256:93c746172897f05061ff8b4e3aaa48c2915f8d170a1e1cb6625a99600ffd1e88 +size 580621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8527d6c66d..1ec0f84b3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66f2be359cbe881b0c28c439d0e7e1216aac2e5a2413cf3fc67717ab7cdf3107 -size 645449 +oid sha256:f21be13f6ff53190c3912c2136d5fc67f76acd799f9b146aa17b7ed43cc85069 +size 614747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 688caafbca..6263fce203 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1791f48e0f80bc6e6c5d3a2d82a8fba59bfcdfe1c395c3881938fedeeecce044 -size 548204 +oid sha256:ec5165160c3c595c329ab6d622f266358a8a8327cda9a145306c508f713cb6d7 
+size 523325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 87b29a07c8..ed45a26d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d253f8f238d3abaffcccf57fefef29d1649f772ea4645ed663ffc47515d5bce -size 598530 +oid sha256:c27823deb65ec0b3e6ba90731cd229af1f5ce5c83e0255fdd7c064d1c93cc691 +size 580113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 68419b1fd5..4370f9b3e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5bd2d3153c908726aa37abe9f486a5e60a03048e72313d85583d2a944cf43e3 -size 518800 +oid sha256:3b08d9974524656ac01a10bcfb92f7d6b27ee40b1b583e10f903d4eeb5dd3d08 +size 497669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4dac52f9e0..b24509dd6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cef29f205d11f725304269b3ef752768d5adbcc36324def8777d2edc32d0470 -size 668247 +oid sha256:1c7d054db900cf9716aced746c86f00bc6dbca53b10d6e7da6f7e98078a12a6b +size 636706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ae418fb0c..5f418077bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2263b2d4c744274ad5541c78413be3dd51947cd217ddbb037d2e5b159ad5b61d -size 640765 +oid sha256:3a5f35f9da65582a3f41e96fc5c098e2d21e8df1267c339eff6655ffb159e89f +size 614207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f29dc2964b..2c20ee1ee0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d89246b89796c229bd9bd5dbeb975e76011bf1f7a0c3dda6f40bcb73d3fc8622 -size 661185 +oid sha256:4449a9565d3c21110b2330b0a982bfa7a0c667a785239a98d459b9272d76c39c +size 627770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 143bf54e0a..3cf2c9f3bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4b40c1ad86370be3e04af830088e29c0877e0660155caa076dd2aeb3dbe8c61 -size 635283 +oid sha256:34fdc8a9663846da4a90f9a7feaf6a0c0c327838a67c949738868ea04b9beaa8 +size 606455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a7247582ee..dd869bc09f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9efe37b252c61591c40ca77dea513a79b077e748c7606ca79a3d32d998c7f44d -size 673407 +oid sha256:26b5d964cf7e2fd5bd2e71f072ba879b3cf81a46a38a875dd3d17be44466f063 +size 639104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index daad8f3177..0fc4337624 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dafeb2d9c85451f8fcc53cf965dc892789bd6a3b4d54758648211bcf23fd2d8a -size 568366 +oid sha256:3c27f1b55598623059bc8143caafd57f342f6269eda26864a9f00ca7d0fa78a9 +size 542253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 582787ca6d..5585ba7a4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbee6e7abca1d8083cba30f748c495695897e60359fa94ea3e202b877182e9ec -size 642275 +oid sha256:1530e475ec52c2b762595e5b9bf900f5c9941bd0e22b034324cf1bcfd06767be +size 611869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 13710191fc..9dbfc5731d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f127357eca5fbdebd5ad1e595020160271eb4d122b73eabc517b22588245622 -size 538962 +oid sha256:905ac985219465943d98ee463de96ce08d0cbd2abc751008acc4749b38b8baf5 +size 516549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c9fc9b4ef3..3e9172c1f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:717cdc5130bf13f48a77f2399d5eefea7738ac36afef44038b5ba01587da4bb5 -size 658577 +oid sha256:68cb082babb58df2ed7dd495026ea02b998a5ca8861c532a45877f7d6de52ce7 +size 629552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c21043166f..9127f4d65c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40db2af8e9f054abf97c6a487c675d32be0601529b53b72ebed282f301bfcef0 -size 631837 +oid sha256:7baaa929a63eb985f3128ecd1dc82bbf0b9d52dde97e81123855429af1489dfc +size 607053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 92479fafb9..2ae6075a9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f2be2d4b5fc038d9718997b2d938c3d56677ebb3b568a679903d76f3fbdfdad -size 651517 +oid sha256:478b42f439c2ec020bfa4004aad88bdbea0cf018e018ef0ba735a19a0fc8dcf3 +size 620618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f4a8ec9b22..34a1fa102f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d148c49657ead706709921d2623c640e950f39a8ab4b9f709c1c6014010b6d8b -size 625565 +oid sha256:a6dbda0e9ce6b8b037ef4590ab8ee1d8628b2d43a475b91e39da4ade03b62a1a +size 599351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7e75b603e5..819973a6d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdc730d6a54fc3ef8c5ed5b64512a51a80632d22b66fe9b1d06b4f7d84e5ea59 -size 663687 +oid sha256:645d5a8d9fc0e5a98d9aa01fe9058654e3fa81d02ffbe19ba4c6f92a04d85501 +size 631950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 630c86b527..e708fe2ca2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c957b53de9c4902f3959c1208e3bfa7155a3a00a6e585b475f1ffd98d0361bb8 -size 558698 +oid sha256:91e55d65f4f29a313d6e78ad84f48b862838ade08b617de5eef59cded625f716 +size 535099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3e8fff08dc..92d8897385 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c46f77f43e0ff2e41cf366b6530e3bfbda4c3698d6371ffa81d33159f4c556a -size 632557 +oid sha256:30308e105ddc1571c16afb02d640b2b87e0a84bea8521bc347fdd82e32280251 +size 604715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf8f46450f..64e82c64e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:829c034966fcdad59f0c4b036a1908ff1d1d25bc3dc53050f79491829036373a -size 530082 +oid sha256:31705902bf8b41397d09d7da273052c9df23508ccc40d287428db05ce2bad042 +size 509395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7f6759e2c8..6c29849001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f7bd4ccc8bf46846d09edd40f33de79f5afe290b35ec8b77afa02ac1dbdc04d -size 810595 +oid sha256:08a2f989aa960cdd0c799a5d6bc3e069b0683712e0324f993aaf627e4c4038b5 +size 747974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ae5e4c74be..e7fbeaf9e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02977f87f7449d99d34528c65ee2d640a24c5d6dfb804e610513ddf5981c2ccf -size 793913 +oid sha256:07c49df7354963da1ec242134c05d17968b9533d02f7f2d6aa886fc7871ad08f +size 749054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 976dc611d3..6dccf5d9de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d1830cfd7dd19a3386b8f141d14926fc5b917202830de124f94e7d3ac48c3fd -size 736871 +oid sha256:bb4327f2f98c9ce7fc71046905059ae4d390da729cb0c98bb4d13a3d2ee549ce +size 711644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b0444b8c0a..38d7623af1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cce285387ae0a62c1de3810b39a5fb9fd169dd37a92c595b5cb550471861c32 -size 651911 +oid sha256:1489eeac6aadaec3eb4d53fca33290a28a132f233b3565645600e61dab88c62e +size 628264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9a30166f19..1649da2aca 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f34761f34bdc99b7756182c423a14c3b3decb0b262c375a35135a834ad43ba2c -size 800925 +oid sha256:57116fa0c5f4d1a6abaeb83c9b4cbf2f2268365ac9a72a3bc7ce768bb072982e +size 740820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3bda33334e..46431d0267 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27258ed83d9082c8a89f804c24624b16c527754a7323a7693bfba2334db596ea -size 784245 +oid sha256:32ff1939e77f018b5a69517b247ce889a80a2a9b904050293910ccfcd3bd76de +size 741950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 649aaa9f78..aa8704f91e 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:076f64da6409852bd99039663374237da4398356409cccce8e090ee64a168cec -size 727103 +oid sha256:3d781d51bde81a1fe6290a8d0c1b7e6cdd04da606fd9b4a768633b6549a9fac1 +size 705280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2b6a7d98cc..8b2f413e6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d348a47ed3d4c47723f9ef75ad8d8a65fffc4dfb53d19cd41a51190bcfcbdfb -size 642241 +oid sha256:d5031741bf63b0256ab46c57428e9d5a1892a5d368237a73c40cae4931f195b0 +size 621110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9f0fbaa13a..2d1f262175 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5d4c5e6472d6b49d616a92f68c5d4d611105c29e24a45b8068adc022f144654 -size 662195 +oid sha256:fd7af72571b145b67a77d147cc7b3b9368de54cff5c45f3ffe80e0869f537b20 +size 627300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 596c605659..5f182be01f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e9ce867c61ccf9c2e401509417ac5c336c7dfe2627a18d6a1070ddce4dc6808 -size 615376 +oid sha256:3d7ba506c937c1936df2919ab667cef4e25af9d92838f88de45d280963ef278f +size 583589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9c145fa29c..a300f9b211 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cd96b40a4243dfc1e9a5ba648cf522ed9367107a9fa84808bf18ec438b18e1a -size 652669 +oid sha256:f12920444e0d8e07b5213ca5d14260723188274ff108f09a7c2f7e4e14646a94 +size 616243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 930d4accc7..7a2c98f833 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc2107590974dc68742ce3e51d73f8eea8e53f91bbd96167947e734fba2632d3 -size 609104 +oid sha256:ae759c24c281db1c64d8e25bd5ea0fdd63a2397474b057b2a93d796d7e0ef832 +size 578057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5f0b3e4c78..86c5692c95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:db5e1688879cf9c3dab10f226b1678ce080ab5f990e12741c0134c7097f57d79 -size 646191 +oid sha256:8e5752a8fa70b44d3ea90364bbce1045ca2976b2ac698cb6e1fdf5936bff0173 +size 612529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d133673eea..9365d75b51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9890b141af97a869534ff2682d6580f50071507e6bce15a759e264e8413c07c6 -size 548058 +oid sha256:56153a17e3d8718297f7b184b3d96b5412332783fba1c7b6f6884ba728325be6 +size 521155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9518d60d6a..13f854191c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cad9388e000d17e16fb4f34a99e4630d176ef3887deabb973e51cbb136cdaafd -size 599272 +oid sha256:bcd44da2d20550d51d3d0c7d218f001624823a628d7c274cbbb1ac390d7500fd +size 577353 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2d6657ed38..aeb8d95d25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aee1fa2af26df3cb3bc99c5db574068725b1136b058d89a2b3e1f81ebfe1cae6 -size 519592 +oid sha256:3d0cdc441ff9cc39a63c2e48ff7ce717cb8677e34f44294b0e127d168ffc5e8b +size 494415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d0a21f0404..53b156ceec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f1f92a916a99407f9673a542ca700dbdfbe81a8418def7c9503575a2c3f98a5 -size 652527 +oid sha256:0a962c2e91c26006671cb2771230c80c61fa39c9d6743d38c1388f05f686fc06 +size 620936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2295384fb7..69211cfaca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:006fd50b1779046f482c6035995a6f3fd05d1bf6ecb908f00135d00eeb2fec61 -size 605706 +oid sha256:0ccd7482d5cc18758da775f2f55b489ea59056c5e27c2f96eab55f58fc890745 +size 576435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f7c090f7b0..d192d82f7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:903bb361b2b6e2f41171834ffad414e39db16d12abe4b0756bdab5bc9b4753a0 -size 643789 +oid sha256:f70ea77c5794aeaec3a1b802f6d3888bfd292890b0635347cd9b3eb06adefdf8 +size 609089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f66811752..7abecbad13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd5561fc8cc9414124b4ae250656aa1197b2f80825d52a4ff71020353f070880 -size 600224 +oid sha256:15526fc53305c125e2f6441c225b21a0527198c045d0a5cf86481443bc51bb93 +size 570903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 289737bb15..36334d6884 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e46c5c5808493a932f48b678462f9667f5d9d66c2059d22760719cb3142dc45 -size 636473 +oid sha256:4df9109414941d84f1c81ffb58856ff552b135ff90cbe13c3a52a389cb1d0a8a +size 605375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 776b3f33f5..2d88a408b9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba08f987b01ce197f1648a00da3bd641fcc4f8b9e62c845be0c1fe42a56848ef -size 538390 +oid sha256:19fc4182ba2fee649e0592d27a254399c478bec9ff84f00fc44b3cb6bc498a7a +size 514003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 95a36c8d31..c36b9eccf2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b074791d9333bcaa531865a9ecae11949d16d6a992879f1db40fbc71cc0ea1b -size 590344 +oid sha256:1d097a7ddd9698a3958ea03b59f39fc778bdafc61cf20f513b75e0bc2037fb2b +size 570101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d021f03c20..ee05a4e15c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbe4d25ae7907d4e9bef8c1457afebe11269224664aded156042d2f2a99925fe -size 509872 +oid sha256:8bef16385ab92961f413bf28e0aa7032cc692aa0ed3712b29f39ebd2003b3f93 +size 487261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 46905c1847..698207c962 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:053cb38d6f56106f9edf1546f241a997ce08677f8aa7fe65dde7453d768892fd -size 681371 +oid sha256:6425806ec1de1693e4028daeade402a32da45de87c7f1c053637356f3f565403 +size 646032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6d1f52569e..00fdd478cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64759758f8a359300adc12c5a6936c10bb3c74876a31f59ecf10b032258ce59c -size 632183 +oid sha256:2d4b7b154c6aee1cd825c9c93530163a601c90e2a7ed9d7fca7a11aa5a175bec +size 590381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fcc379f28a..3353b17847 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f9dc043949fe49d1cd9fd230d4a1989d8aa2adcd11c855f54b201a49169ae9d -size 671055 +oid sha256:96175379e4c2326f98147d94925f8304c0a2b47dd364cc6f721b1e28f9366e55 +size 634926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9595add972..b91fb6ed68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6ddb0f6d908236091c095f9c87a7ca41376a3ed872b2f748f72cec8a3190b93 -size 626701 +oid sha256:9e8990150275299537e537933a1ecc3aa881ecb9387a5a582c9bcf43751fa4ee +size 596147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index de5c21ed68..d97569ece7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5171574c7c86046acb14ed9feb981f8a5c3540bde12be46f56c0ac63bd01f2ff -size 664429 +oid sha256:703f26be352700edde5ca939ebbd005bccea91cb8545bf9d58b9d19d85aad608 +size 630472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5648974f68..5e8e6a6f5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d92248bf460003e59f8c6398f1491b4deb816fd1000abc93c9fbb652bca9f38 -size 559242 +oid sha256:e8d1b7640e1dafa16c39e18b38f98a71a70e39d2af690eb08e3506ca515c16e7 +size 532685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5fd3d2fce7..a08046ecee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f5c60e0d92d07fcf352e7274181ee9941ea0aae9962fe0afefa0cd500e23398 -size 632213 +oid sha256:a93f2ee5d8b536b94fc7918f39ffcda3cd00fa33f7cac64d5cf5ac2fe344aece +size 600425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 819ac24a2c..f45556ff53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ac557fafb8e5404f471d394c1d8a191b839e7ef59b3a4e78b1746f2007879ab -size 530034 +oid sha256:efe101423ac3867a5f50d0410fac569d325b294817c0174717f363ea76aca887 +size 506831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 34e8683836..6abdddd4d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d278d999a15a227c7c2ccffb7e745b5428581416fea089f74ff23cfb10f83d7a -size 671701 +oid sha256:fdbedf9bc9f3072fe0465d93b58361a192e5ff77e2a5e46a6ba9f4f37de6f75b +size 638878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c587c7ed82..a1e2471e3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9b9572bc0654d7e1b07f32ef3fd9d366cf8bdc32c7aaa9250d7f9961a0c8606 -size 622515 +oid sha256:158636c5c511f7906320e15abf29d7e1753f0a280fd1cf5f2f1d679b37bf0b5f +size 583227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 16585be087..ccc3e0a2d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:469579661cd1a88b3dd6a31389c51e1402e887cf842563e1ade3e2c18b59cab0 -size 661385 +oid sha256:2658f0692d41e69af9fc2c947c41e6497fe22f7168cb477e9e9ca66b3eddb063 +size 627772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a29782393..0a656a642f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f96ee33f8ef953b86a32643bcbea57dab5ca395b0abea91c9a785b28b7847f3a -size 617032 +oid sha256:7e23fde4d5135527fa2bb781325dafb142c8d96c9fa98d8106684a6af57e7420 +size 588993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0da781016b..e55e7daac6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b76f0fd70688cc24937146866130dbdcfe7fbb6adf1ec48337b02b1d5e7eb19a -size 654661 +oid sha256:1aa946be110a7e9665eb0d97a0fe941018a0c57a595838f721c16d543baa603f +size 623318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ecdcab62a7..0affe2af19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b4f3a26a6d0466ba6f148c6995c523405bb4041522fa51a475c97b33ac0cd8e -size 550362 +oid sha256:cfaedbcb6f0660c85c7fca693dd978b6cc824e9df09879d404e3f367efec9c74 +size 525581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7ea2eebcca..051e4baa1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93896092f9d0cc18e9192483aa58900c4ecd39389e316bcd7c8e6aebe75006f4 -size 622445 +oid sha256:446fc8b51da0c446350a50215de2ed736eee6ba0ca76f10f4bf2fc51bfa7f767 +size 593271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b9a563ac1..ec18773b58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:866c51c1aab8c01badc2212f0eb8e45f6a5123c431be24edc57213e3ffb5ef08 -size 521154 +oid sha256:435079b540eb5ec4174ab386148459a3c8892f0c82e0dc04e6491aab95c4f515 +size 499727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c1dc30ea9..ce46a7e5f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52f1b941b345e1d08b58ef0e0fc6ed5181f300165c7544e94e72e2b520efd8f -size 778179 +oid sha256:3989b6b5b8f9d15032af2f392848faf7c5a95cab425a1b2871e6c47bd14e105b +size 725868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c9893994e..ac44fc577f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b7860755f579d251a734ea47503b36e789b024ce19f33219dcb35e6208cfeb3 -size 736391 +oid sha256:2e817824f2f489b87773880f407c878266d3e856d09cf7fa0fde8951cfbb8b94 +size 686350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bc2e4fccc..23537e578e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d5d7e94447a9229365c2ed7daf40f70be03945e5fc913e8534cbde249f35a70 -size 763717 +oid sha256:359c88d92777577f2c243c9edd88985bfacecad86e5cdac3993c378a6d1c92c3 +size 708300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6b73d7fe8d..cfd05052cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f5c65353cc930ceff60c503603e8f2e1dce4342bb3666ffa6ab56379a10ea76 -size 727259 +oid sha256:f371f75a4f40d05a5190fa25e9d351e0b8e4beb6b6714264494007c6ddc6c2db +size 676872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index edd84c7cf1..855bcc98fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7b12f5abb7a1417a52da41c4d98d5f0631b4ac3546bdf164653545a87103de0 -size 761139 +oid sha256:4d24609ca5de1cb6b837ca5b8d4faa78502649de820912085dc6bc263c771769 +size 708780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d8411cbce5..0165fd2e07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:428a6cb6744d0f66637846c8a24c5a5f1aecebb9086db7ff2f17646283fc24fa -size 656051 +oid sha256:5bbc02fb636319b82cfa0da5144d9eef217d8e33338e4dde35ddc47dd73853dc +size 618096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ff52a615f9..bbc44be1f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9722d482409957c083ec39dfab3a624ed5b00d29dd0312f28d5051d68212b68 -size 705981 +oid sha256:6589d404642dfae67cba291280e6729a4c803b00897807e65011559542f8c1bd +size 670346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5794f29eaf..6a16c8c30b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cf3bd69bb1fe8c8c5b431d194bbce6a50cb04a1cd752dc9c0722ba4c2127c0f -size 624377 +oid sha256:932afe792c0f5d6a83cee3877bc3b848fbd4251648313f08f7e90225c77314eb +size 587261 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2050b5d27f..18db4cf4c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36bbfc8061349fa29f02eecfb547b98d3ccf121cab8b6767ca4caaa561786e05 -size 758001 +oid sha256:32e69d28ab460696bedb9fb55de794687b4adf975d6a067c86681c98ef42cbdf +size 712352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2fa3dc3f89..52ace4c417 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85dcf6550af219264c09cbd512bfb50ed0bd1df5b5ffd5fd2f05b1656d571be4 -size 716263 +oid sha256:9c62078c8da758f014245c5b04665f04a0fd5f4f4226a72a04d6b0b8cf2a9bff +size 672834 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c6187a9cd..a0e8f96a90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a29da1eadd3e4060b47dce6bd790b34e32a2f8f0f6e6776d1d03f22ffc480ea8 -size 744379 +oid sha256:91c1b2fedc27e1aa98154b687c4dc87b13b23d575b0ffcf8b030deb5b7be9a87 +size 694782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index de3ef9dc5e..f3f82d92ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a8d33282ba09678773a3f49b141f98d2c1dbc7e7c168d61160a9e4b5f527c6d -size 707919 +oid sha256:829c5ba340bcd1689f1dd553a92ee67982aa60c4eff232a9aadd9e407d8a3213 +size 662566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fb54515ab2..f2c5c07cf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3d57cc34109148c1258572f872dfe07347d892dea4542238d2c517e71d1418c -size 741701 +oid sha256:00c178eb7aba46ac1d74889347e9c0d7e8bb876b3692e37a00afb1a7bc1e608c +size 695262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 914aa3fb2e..ef7ec45b81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03bc99c398ae7d36d1770cc3b96bc66212f43f51b9c5a28975e17c5e7d125b92 -size 636663 +oid sha256:7db7ea2bf06b0c576013a360b26241a477e943317e473a918e1e2bdc941a95ee +size 604579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 982bb25f6c..dd4a297b33 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fab0558f0d9d8a737b6817cbcbb69569e0342a5b35b04f1c9d6401986008ae4c -size 685755 +oid sha256:54ddb9bb70713c4d7c26afcec917596eb6d7e2d89ba4612b04d76fd98d631d4e +size 656138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d72cd1be85..61f27e8981 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63370f13d6d303919f24c89782a31754463aaa329857fb3e9ea060ce62a69195 -size 605776 +oid sha256:0d94e20da36ad79426afdaf3103bd4bc282fcc9f57200ecdab66fd2302119f78 +size 573743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9f57cc00b1..034f2601a9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbaeca1ebf9b52187fa0c336162698af2cc96964a3b1f2b63f0e4f7026333fa2 -size 797353 +oid sha256:adefda7abb2d31de35d5497907cda4eb48d8a5af4c08055a17b245473d1b7dcb +size 745488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfe94f0368..caa5aa3ff7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14dfd023472188bb1f7012364ed25c0b16249fc9bf475e916727d1f69720f149 -size 754037 +oid sha256:6bc3991f2e2c32cebd4dd00d52c873438c102a81bc46949881289bfa06e5bf55 +size 705872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a5dfdd1af..b4200b1cce 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2559f7a5296df9157d1cc0142b1e7bdfd67cd6a2636c0bdf07170b8b93630632 -size 782893 +oid sha256:ea43358d250c55ac87c2c72c62a6c1c277286e67924ff26cfe9d298cf8bb1d98 +size 727920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 76e3d8708b..c5d99b1395 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddfc333ce1d98e511e7bafb6c8bb33915ba8c8d7de985e0dee37021fc53e5880 -size 744855 +oid sha256:a081f352c9b7a15b471750dd5effcc9b4fb30d76e1327416a6e71d4a50f64c21 +size 694814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 976519497c..3f3f127c4d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6515fc42d1571969b956d4f4a41c1242a2f1b7ba66717543a8dbe67e6e7839e7 -size 779969 +oid sha256:cbbe0b401323ee046b87c50494ed54269fdfcd271266e6a7f3a0be3707df80d7 +size 726770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4d6bd5664..6ea43032ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeb37461b6d8875bde07be9a0b48afa3c62ba6be53c644055177bb928f5c147d -size 665113 +oid sha256:e373d979beb515acadfc7cbc23ebe0fe3fece2ba052ea96fe5be0a3f51a88b05 +size 625530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 40d913d425..2ee78009a3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67a831863d6d76690c7d1c6d81c0a114b3785389f41cfa94bdb26aa6a249028d -size 742227 +oid sha256:986f312548c955fd5f68000df607d18ce8615b594c1c485eab889926f372bc48 +size 694110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a622a87ebc..fb45752910 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a01ac272bb15d120af500644f4e6d2013201ec247074376b0ef847cb2a6b134 -size 632599 +oid sha256:892bad2807a3ee88a21c64b5c8ea0e4ba3d386092c241a887e2cc4b5893dcd1a +size 595533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 89f9451954..dcbaeb67a3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:401b8b126c3c8f438fdfe19ac40eda43ef04ed8a788a3e5ccbb11861cdd906e2 -size 777225 +oid sha256:23b8b0996cd025cd7cd6c39ed09f0614c65064801b1ae3cad09141b19e991cf6 +size 732020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8d1f0455fa..f830d2112d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dadccf08bd498833f0a259e59049edcee0b5b21000ab49943cce83ee029d7da -size 733859 +oid sha256:8798f19738b3067183f250f6be867b9cc818b6ee6abec4b99cd8ed9dec1ed6d8 +size 692404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ef02de7d95..4091732a27 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a77cba76b907a8b82894f12733ae7e4aa75caaeec445d38fd31daec8c98bb78 -size 762765 +oid sha256:a099e7e33957a44cb638d53d34c66ad73ef2c68ad2629ba738d33835f4dcca3b +size 714402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e737110129..6f21366392 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d101629a6a3c96704c95ecfb3b5137c99750f22c380f05d1490ca383143b4d5f -size 724727 +oid sha256:c9c2da872e0a96ca950099ef8ecefca62fb9b508e706b236372a5e5361121b11 +size 681346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c779d21444..1d05db98ae 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ee3357db3013774687587855c0c84c0e9fcf8f7f7275072dd11850608c5d5d3 -size 759741 +oid sha256:2f9c20483b2299564384f6e29cfb674524fe01c5b0137cf357fe4408cef80789 +size 713254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 205fc149af..41b5ec9e21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:972d12c88b0a733dc5f965a228610018fe9eec628853c4bdbe62026b3d5238aa -size 645725 +oid sha256:09eabecee6fc2338d159b3d7bb70c0013a92204907c04be2da11acf6c366df43 +size 612061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1b745b14d6..dc5ea4f1da 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2248f050185d343462573edb7f22976fc36b506057910fd2df8c3e68416db986 -size 722739 +oid sha256:a37b686a7c8d7c978321c939477b09d00237f6611dcf8c04b804eb32e437f458 +size 680592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ca5c7e5cd7..ba62a4797e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:856f865699610341b9b1239cf1901e58c639518907761f3dc503f09a778cc398 -size 614050 +oid sha256:b8c2a7d72d2d3bcb14b5fd7afa3813640afde5224ebd3df23af6ac7c1651449b +size 582805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 448dc06b5b..2fc75c0213 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a86b87cd694dda5473c8cd6a3f9b7ba19f67334a7bc37310b67910f5a36734f2 -size 740885 +oid sha256:dc74d9bf2ff2ec12f03ba416d6f8fc795722c6d641ab2bf05d38ec3db6b15362 +size 688476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 322446c7ff..1ac4f1f18b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11512f53a0240fe319a170dab8448d26f745a4f5c15bc4af7ee9eac2553735ae -size 720553 +oid sha256:52ef1d50bb70770a89c7c054a4101950e7b7d4fc93be8045cb5811e8a2d10864 +size 671400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 45f9b7cef4..b936b02a59 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2df21ed90fbaa773a2d400a5b3e0d0c78765998c8984c36aca112c9f06e00c3 -size 686105 +oid sha256:5b1fde997e8fb5ecb8f3f95dfbc5462e96146424a6008404a83053ac08540fbd +size 665762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f4e8940c03..c6284cf095 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48558dcf86d84cccc728011767ebc43a2f9210c245d35fab5c93e7bedca8e20d -size 601932 +oid sha256:9f5005f964b03d4fc630dfdbae855a6e6d2b81c546bfd9658a428e3b2468604b +size 574487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9771e89ff0..eed959bb1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:922d9d511696849dc84b4c82961282b2969d490cab5c505e3685cfa7429fd985 -size 731165 +oid sha256:3870b3bedc28ef4934c40338a485a384c83b7e7fcbc117ea4fe0ef2569fa6ca6 +size 681322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6819e8503e..e99b079c0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3a72036d61d88fe7704e7c2269582fc77e5e11ddde00131fa9a0723d4c0ac56 -size 711673 +oid sha256:5e2c45d9b75ed3eb38d2f42b25007bd940ca5dfe71151a952ec97619e30dd4f1 +size 664296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index a9e6f7154c..c83f32aa89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9daca004484c86a401811bed8722f0377e34e77ecb0c80749bc41e7a274b3edd -size 676385 +oid sha256:00bb64cb94302f5182cdc941fe0cc51d1e3a4bd73776932edbbaa9b018cfae4d +size 658610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6e71667f57..35bcad47ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68d7396e5fd22d1fc9224068425d8d8a3773914f6ee2136867e36c59878ae2cc -size 593052 +oid sha256:66a3b6ea167fb51fd8267fbd1cebea369f8872a9dbddc3a4473b923b11dc893f +size 566545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 374edc1e36..563c1aa00a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e9fd3fd471ff5b4f2456ba9e3b1feb5c7c7bfdb34e7fb0612e4cd013a555c68 -size 640339 +oid 
sha256:fce7db32e3517c013960347af1b0bd184e2ab8648d8bb7adc60ea7b8e2470cad +size 607071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2544561b06..7f80d8cee8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:615fce75e9782edffc0a9c6a475a9710793af3078121db3c1cd1311bce79abcc -size 614386 +oid sha256:969fead0889977f59a9e6c8025c46f71d04d524ac07fd7f6b5de93be11d94200 +size 583783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8dac3a35d1..c3a9b8a13c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fda4575571b192af9de4a18ec3bf1f3d4ad4a401fd94a53e2384102bdbb90c9 -size 633229 +oid sha256:c805a650db129de6695b2a4e859217f489c8d5685040569d9bac42b24a6f62ae +size 599517 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6ca42c0376..647a4cabbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56743bc582a132e9ed8b3f34858cc1e15a0dc1ba2fd96e0039dc10b172367497 -size 608116 +oid sha256:39df2855af7db083e02233227f6115b01c671760a17c4992273c688a71ae40f1 +size 577463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5e0077658d..03eef1df98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a69adfedf539fceffadf8c8cf42fd4dcc58d168737a5e185f155f46bad9fca -size 626901 +oid sha256:515c18f77d0cb4be39f8ddc9a02faed18441dbae81cc7d6624c27261b5492c4b +size 603401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1f3b7e6476..4f3f38d1e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:030ffb0aed84a56ed1338be8c8cca0d708b306f7866464203d8f679b357dcc04 -size 545984 +oid sha256:d5d746c4ec6e65ca8756fc06eb7a9c96d54c23d12f0fac1151c266bd748014d1 +size 520561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0e4be915eb..953be9719b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9e0fa0cfe0a19666e5945024c4bb3107c2b0092d9445067a8e3f4a274f86088 -size 598334 +oid sha256:dabef35bf44f7dc522acec0a14d70884e38018cabd64c2eeff58c018a2cb3175 +size 577647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a39b2f4fb8..0a2a9d2ac0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07865be4d397cbe048e8d98b23ebaa2be9f3e45fa8a25fce95fcc917f8ccc16d -size 518750 +oid sha256:a675ddcf2242f9bf97592732a3e1d850224560a285dd606d0b1663e036f9f00b +size 494363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 81fc92322f..e2e9957c55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:703993a4d180d9ccc5e6eb63441d53953dcee3523243c3c2b0d03387b7bbb19b -size 630621 +oid sha256:1ff15035bd9d900db62450124f0eb7d5581132468d96bd0e9d8ef698042bf04d +size 599917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fefe3a3473..d33e71d246 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1076b51571fbcbfad56558c0d19c91c8d92878f0df9a989c09607d206c298c3 -size 604718 +oid sha256:ce93c352bf688d1609018bb31d82ce47316e92353eb172949e264786f8e0cf57 +size 575841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06e0f54442..e9b60c8569 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41f5c16a432c2f396ab1442b782cedfdb9724215f52a0995e8e5159495aa1655 -size 623559 +oid sha256:dde192aabd3e1d8e9a1e06c594d8baa0e664cbbbd3b6e6533973380ffc521999 +size 592363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5f66bf6fc5..45c1fdb0f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:516c7fc9482b1655f53160c4de89e609abaea00b47b934f893415fdc834d9776 -size 598446 +oid sha256:b2552792229b4799e78a0c21e1bad040e005577d13648ff89e935ad06e974a52 +size 570309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a373ff97ab..d75f08b1a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61a38cd5868701887884898433e3d1f59c683abd7ba5187beeb62f304cc92e1d -size 617082 +oid sha256:7c7e347bbd602f962e1053ba7ea26bc9e5acd3ac446ac83129cf766f77b052f1 +size 595457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e196f38678..5e1000423a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cfdf9c2d645db76d16394df8df583dfed9ed28ff32247f1d42b34a43b78ad60 -size 537104 +oid sha256:a4d54d5fb1893ec7deac1801c08cd06386934abab15bdb6a28d89d38d7ac701d 
+size 513409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4ebfe6afff..a6ef80c382 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b62006b9a0bcb8f2ab5b8b3d01d780c6fd42323b2f17936072aa67fb5d79629 -size 588714 +oid sha256:c9584191c467580513abbd773e641cf56a09dfe04f1ddef1604fc3a73c4045d0 +size 570591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 58b5a12c8d..1cd7db505b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dea4b22319bb25a0e826a113428115caaa1d07c21d48eb3bd87a4d29d5c6c679 -size 509082 +oid sha256:9bbb15b3bd2b9e4694761c0780f7c3e397596cc421195437ecd68faa8776fbcb +size 487211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 331e786f22..87f8f7588a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22185fc2cd6802c01f6adb70f4b5bc556aff7d073c5fd580c7e1a4b3643ddbfa -size 658725 +oid sha256:edb99d0582ba5a874c7365fd427f0c7bd5990e2cec45ece25fdc6d78bed04417 +size 626544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7a0a413c3..69d9bebe3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c743832f1f1fba6fc5ebf1dbe83192d35e01c315352e0ae427ae7aaa99b11f9f -size 631195 +oid sha256:532a71d241e53937ec20f7536d71fad2ea208310c700ea1e7d78e9ff4dff5966 +size 591119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0ac3c6ff1a..546c1817bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8472ad79550bbb7f393dd90500c8a0441c9c6347bb153ad39b56bc2afaa836f9 -size 651665 +oid sha256:2ac36dcf15d418443309604a9d620536819ea88f04684cdaff07b8671140142a +size 618200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bfb4de2a64..7ce5faab11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2aa3ae461e0a1f50d65b1d459ea90b9ea1cc22fccda109065399a7447990f4c -size 624923 +oid sha256:afa7ec8afd6607322e4d53e124008af0597160f6ac0373e62b0e80bfe7ba399f +size 595355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7afcd29f90..0aacf6179c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc6b9c99152b15ab47be323fdcb7cbc6918a7c1cc90276f94c50ce890ac70939 -size 662455 +oid sha256:bec31f5803471b4a4b6939e8804e6cec0a6fa9a6d5384b0daf29ce2cec775dcb +size 629976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aff1c9c449..51afabe2fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803654573cca2f43c1d6379199e0e9d92a946b35dd780bcd0838e194a1954b7b -size 558056 +oid sha256:7aea51fbb9e68a4dcfc6d24ad94332bd302c28b07d142e8eae92c5deed499b2d +size 532337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3cfb70666d..625a49f82a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c19b67a3e3650343c85d816fcea81da7d27eae23c4fcefc767db000993e8820a -size 626833 +oid sha256:e9a34fe6a4e3ae279a2737e7405771766d54854c92a5c597d08f0f17c313106d +size 598795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 295a2d6e64..ef542758ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca6b1176ea11c941e23b5b7f8c1ff315d8c7415bb47057febbb05b8fe6c776c0 -size 528454 +oid sha256:1874e1d7cfae5211aa173e09c9ba933940bb0fee428b81b1c1bdcdadf7a073ba +size 506929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 75b95d7295..0e6a19c306 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b641a0111d0df927f5e456428301c7483094d9359c94de6025f352658ce97b5b -size 649007 +oid sha256:b6a0b5cf4f87dbb3b70c1f08067f529010ca0bee7a20c8358b9bd6f55a97d74a +size 619390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 81b2be38e8..a5949f56d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed06ff92c9e40f32b00d16147b6754d123b66d9855d71f568ce17ee2fc8d7ff1 -size 621525 +oid sha256:5b207a3cc80d15b336c934a29f7eaac24d94fce79b2c9789e4375db8b7b9146c +size 584755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 262c916512..3fe6649214 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76684b0b23b5d11c22639cfa4b57b1a53fceb0f7f61757431db80a620d9ff8f8 -size 641945 +oid sha256:aba80e6cc45ecb728457c7f3e8a74997ccec78da820df3ff661bbf90baa97352 +size 611045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5df3a83816..4d2285fdd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13a49c6c3f786eefb0d9b2e3a9c3fbc93b6528d1a85b3bb19e1ca071ed1024d6 -size 615254 +oid sha256:2c63d61453af3e37213334164b411c20339a8f24acd5192032beeeff5e4cda61 +size 588203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2ea3b6b890..6dd82b89ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:799bcbfe48fa28c0f463f29ee1260bc319f27045c594865a583d6ca8e86fc32a -size 652687 +oid sha256:e9cd8d77118a9ecad2324cad266e94da46bb8754d9ec77063a6cc3c99e5f35fc +size 622824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ea636b4e0..6cada43ede 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71c6c460d62e223dbc23ec950714eb7967d9918edc212759eeff83d802ca5969 -size 548386 +oid sha256:a9738fee11fbe24490cc44c4cd85820e964d19eac52ca18a49b585589bbb054f +size 525183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 969ad7477d..0b76ef1062 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b25e40fec3d384fd7fc34b7f2f0d78ace4857a6ad3dbd18dbc7008de3539944 -size 617064 +oid sha256:ae9ae53a2ea7679679019b0d65dc2b030736b77b42636871569f85f06f3cd5ab +size 591691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 39a74cfe26..315fb53166 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc5e6e5e6f3d70cf7d7748fc72e9948a4d83fb99080626e0cec3fe33bd20216a -size 519524 +oid sha256:4d57b15b5d72394fb1b3e2707e8f371e341c07e6825e2d0295f7cb6b20840f0e +size 499775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7748fa8ddb..853e3d3599 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5e6b1fd28613a75f23716d59a5dfea194fdf096b145c98a29cbc0249c4f8aa -size 810495 +oid sha256:803b2a2b39f8a87e13b12c3a005e4c2e3764ab9e69d5d5500a11afe3ecd132e1 +size 747876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2e9831a901..589398e0bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f38bf4c91a879b004640656465b9b44c4e8722b4c0ef1f302e4be2742c250d66 -size 794605 +oid sha256:5b1dca84346ac1960c2aab5aadd0a4925994a85e9f7f7091f79f9c72ba27c300 +size 749004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 783f533e0b..51e2b00d1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4be93b8ac5e9906f2598520ba62f25ab663bf6756877dca2d71ecc5abd885f2 -size 743431 +oid sha256:ebdc4e1dfaa3c738990616c277c477513af7052ef2f56b40073b14d5a5a62663 +size 719982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ea5c19a459..f0e238de2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c782549fd4fd5fb5591b3c1b54d5b11543e0007de3af03a290b0c3368c638177 -size 659261 +oid sha256:1195b99a21d07db863840debfe4378094f8d2880e8f746166dc262606ee2ddbc +size 637884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 42ce4c2f45..03d2df3639 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cfaf0b9d419db15979a1735e52e8497bfa27c42eac53bb29d682c447c73a078 -size 800827 +oid sha256:54bc2f0c87cc6caca4709afbbeaf0c8d91db12e568d84eefecc9139437c98fa1 +size 740722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d593f54207..08bdd4760d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2c37c79dbfca995fdc451cb0e64c015c5d1c2752ecea248fb4b320fd50898bb -size 784935 +oid sha256:4c3680e84dd39cfcea92bb70547964bfb8237ed3c9e893f84e0d93f9283c4bbc +size 741850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 291bdf989a..e4324ccb13 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8eee507953ead5da042a96b2f3da6f4be59271cddb537077ccf1725b78fa75b -size 733713 +oid sha256:37c608c70c61737af3634b81aee927223e776fb7ae8da2686fb2beb7e0acb4f7 +size 712828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index eb1bfc876c..6d36f5a92f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:734fe0b3fc8c92e82a70aa72c567270fad925fa44717ef086b256e7777a460fb -size 650381 +oid sha256:2bde6cb3746d155194dd256cadd2eafa94f7e10d7d81a755e16ebe2856499f04 +size 630730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3fbdf89d56..fad005be90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f124b3740b1b1ba1f31a7344ea9881e7cf4d882cb676997b9456520d8f1c342 -size 662097 +oid sha256:0e0cd7e6a3e201c31c860b7d2b56267f535da0f76297d91b912f3070c2e1e0ad +size 627202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9df00eeeb6..9bdfcba643 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:322a48627ec1ea12838863a348874106dd0e073ae93c4213efe9d2e6ae279157 -size 615276 +oid sha256:06205f6b259512f51dd93ed528fbbfa22574b2ac44fdffcd3ed5464373cd054d +size 583489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d26f09859..41c960af13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c90129aca0dc8457ac769cfc43312560a3f0f332924aab6c8743e80679f26121 -size 653359 +oid sha256:a63e7789cfc2c987af1e0c5e908f3598c2ab324f77887d6b937295752df8e860 +size 616145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 70d102d685..9abdb89909 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77c634a2021af2f6721e6d4f7d5a8ae61d6e4feca031c52a62cf08313a98240e -size 609794 +oid sha256:3f0425531c08e36403660e2a70609831e8902afa2b2d2904bbeb72e3a185dcd4 +size 578747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b564113618..9cc4168cce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:c3ba65ae063e8fc1bf4b0b362f411f54cf6cc7319de59dd09b0aad554bba639d -size 655417 +oid sha256:ee8e059476f5c986db3b6e5af326a87cc6b3da4d5271578003846dca3a336152 +size 620768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bc5aac5fa4..fa302f44d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a49367cf40a3e87f351c697a6c10b69d9ce878078d522cfe692d43b01757d367 -size 555952 +oid sha256:dcd91eed6305e4e7b8dac433e79bc5bcc29112b49ed29bd90db2484cff032cc6 +size 528359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0e4cc9c43f..b921566c21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a4c6f42c6a24becd6a995dab32eace0c6b118d42226cbc7c420771e1e838a3a -size 599618 +oid sha256:5acdda4b4a33382caa611ced6783feddf76ab1627e7dd79ba06606fe8f750b81 +size 577303 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 571b05c7cb..f2c1f3796a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46af9aca35dd5b65654ee879499af6a5f96777f5912e83ea60c530aa5d473bb9 -size 521466 +oid sha256:bb30e15c288efa1e2237a98b2ac1384bea4fa6cc031c5cebadcdae6c4a9170bc +size 495105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a9290b5b67..1779c49c9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63e7a39f9287bb57eb90d82bf03287818e0c6875c516fe04d3b96fdd0c41ac45 -size 653217 +oid sha256:6b854641ab7bfa4ccb850d4c71e1e41441780adbbaa9ba8b4e15ea3fccf8fb4a +size 620838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 703cab24e3..0999ffc3e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd2132a172415ee628cf0b15a820fc7dcdf99a0332f2cab047b3c8d9290e8ad0 -size 605608 +oid sha256:a5aa11aecbb74663f5f01ff95d800d66edb5cc2f72b74ff1417aa989ce3c93af +size 576337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3c0c871020..537638ba3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a718a958006d45740eee29fe74907262efb789b7a92174f908a469aebd0777f7 -size 643689 +oid sha256:d840d3733d48ff7691f866110277388e2194c87e78b12e686d67698123ef59ca +size 608991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5604dc2202..62ae850ac2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30f2ded189684991ec885c1492c57f0f176d34d8d1baac4531bc9e740cee6c95 -size 600126 +oid sha256:c2d0eeb237c6d774bb321ef1f628e21275992a7b8118937482a838937ffc5f44 +size 571595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f44bff1e4a..4f0404559f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73571674917dadb04860a4649ce9458932f90ca74ab690f9c75570fa92e8bb8a -size 645699 +oid sha256:37780dfa4474a33a1bedafca9bd1562be1e4c5cdf150b95c1835d660fe523072 +size 613663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3fc84fe08..7a0d106265 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8d1207a89cfc333a2ffe3f9c9f3bff2635f44f81a4b36eab9c7900343a6fb9a -size 546282 +oid sha256:929de6cbd50b6595e6cd3c5a0555f358b2255256e0ba4120322251063a5c0c90 +size 521205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1db9336b73..be393359e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:403fbad9f7b898d835bee520055c369032659c675ab939e3aa8ad5f963a10b4b -size 590688 +oid sha256:33380e32cceb371a2f4a4a54bcb0a55311c8ee7ed3d85ac3f60bf25082def8ad +size 569261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 27d91e87ec..424a14e613 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe0e72f3cd34d21cc67c6aff9b8f7db1352f40b6622ea6797c1e2b00d93145ed -size 511796 +oid sha256:70350cef698e0d5ea4e286b6b14685258ccba3d54f17c94416c9a48a09e38cee +size 487953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f75f8de735..49a3d7f900 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85420cad1a3aec121bddbc3bdbfce38adc824c5a0db7ada34ac92b9920378ada -size 681273 +oid sha256:25459e80f51e6b1714b3c1a9c3141ffbd89cc0dbd0f0de0f7b98adf8c5fa208b +size 646722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 429db93b6e..cc38414525 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ef5089b8d0e137300c7a1011d24ee841193fefca4db83935e2ebc7eded7a913 -size 632875 +oid sha256:7f5eea1020b02377cec80d1932b786d015e7348dc13302fe11db87dda9bf926b +size 591071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a850f3a47e..ad5dc7be12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45c25114c2bb575c7003b7d56ed419c98a4d9a8edbef626318fd471d4e035ece -size 671745 +oid sha256:e714e92d9efbd73aa76f3658654b84b58be6599a406ae92c873f4245078e63ba +size 635616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b2a0503bb..98065fd3f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3717937becfe1ba6319d8d98d854edd7e5c16b44ea20cea2d450ad0b8df0f678 -size 626603 +oid sha256:7f90070f21adc915bcd7be474ce5510afb6ef730471da75ba1568147edca5fc0 +size 596837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bc8793e23c..141edacb6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4e7c80f1072cdaae554e7d906fc9d1956e1c99c76e41cd4ed0c637bade29ebe -size 673655 +oid sha256:6c0df6d592211a460bf5d92d64fb3ee59dd347292aa36c1765e85c2279973edd +size 638760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 59bcb5f0c8..890e4c5ebc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a4e0ae6c622d644a09b73d08416b5a3ab545f1504f424bcfe372964a57167e -size 567234 +oid sha256:48bd156742f6f8a68bd18f8fca4453f9c985edfd637e7e62475df6288fd6ec0c +size 539985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a382f022e7..6ec24f24bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ec2efc0a3dd509ab6e0b72721e893d8401a3a0401bbbca0f6caad2f281aea33 -size 632509 +oid sha256:534fb5eea1ae1a3ee169f95ef5e7af791c05222b0fe9612fc4c27162e3d002db +size 601955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 08c7459846..ee0e711aa0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c21e6d90257099e145f18209dc2bddf64eb2b1d2da13a1c92395b1f94c1740d -size 531958 +oid sha256:483f336f6a05b553d55ffa201cd7ad8ddd87be51afeb7bfba3b92f2139e00ffd +size 507571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 51a6edf540..60fd44eacd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cbdb9f75840c644f54c90c4929c40faadaf745fee5436b3fa761f59a437241e -size 671603 +oid sha256:1df55719e6380001a5427ac6b8f1cc9b8f0ef702b3673e32e93df0fb2494f802 +size 639570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 97ed484bb4..21b417aeef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a52e9a59516ca2dea80fdd37d137d4a7a0568beeec809dfa6d1e6cab2108683 -size 623205 +oid sha256:42b4387ccfedc8758aff44ee4f00a6630502a856b7098421b6d591b6f34e4028 +size 583967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f1dd09229..13e2bcdb37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80f946b80d997566d7df66d1836cc2216565eca67993f72d4b2e544fedc6d640 -size 662075 +oid sha256:70acf4f3f0a1f8594fa5539d782f57b1896649697720c73983be197138d759d4 +size 628464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 07a2f5a257..64d18c4e4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cf3c3e70b42300a7df8df18fdddd437c3efb2b1ea5697cf5337a82df96d452c -size 617722 +oid sha256:729932282c0b9bd102b60f96b0463842e01829a232127b4914ede4a44edc8a79 +size 589685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a3347baf95..ec257d217a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21a02440887666e20562a25b78a6f85adbc032c32b5d1ac58aa240510838e4a0 -size 663937 +oid sha256:502a6158044323e75cbe33698b1772168866cef78d416558d498103a43da2c5e +size 631606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 19851da542..6350cfade9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0665603728aa5ec3da770945ce229cd13a3eae1153ac280bc70273d46fa4a6cf -size 558354 +oid sha256:ee038a5ff40a16b40c1e7cca9c58f932aa2ff16775c4bb419eaf9339d36d005c +size 532833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4cda7a117c..70127680ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f3323b469ff002493b12a5f682b5aa392f8f893167855e6700eaeec2069448 -size 622741 +oid sha256:e389553f4d57f3bc29bbab2b4402cfa628621edc8140308608ba83488ec11a09 +size 594801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e934d3f39..f4f7a85bca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4353724d8ed3530262bfbf6c6a257433a3937ab08b68570e46e9218ea50ce2ef -size 522290 +oid sha256:5f430d55bd5f0341bf064155d8643cf5a8125333ed3b90d90ad87f7df92f074e +size 501207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3a895c9335..c0cf025766 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fd812693833d3951cf2addfdd2c883bb80d43209e03641c8d5b0495a15642af -size 778079 +oid sha256:33f41cd9faf674d428de1e3f311f7468cd099a903790d4cb4bb1a0b8bcad78de +size 725770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0c603bf575..4ee7f7d168 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c92d8d35827202de8b7da38c5337f298ad2cb9f7e2c4bc5b7d9e84464cffe255 -size 736341 +oid sha256:5d23cb1f24ff5132e8622756c254babd5472a03a51cf7e8ecef0b5c62250beeb +size 686252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b76839f58..09fd951994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d176a3c1ef27db534932efea8f55220cec0497589a79a2d1f21b0fdbb84445a -size 764409 +oid sha256:c160ae10c616fdf288e243794f1b2e78bb396e84d817a1f6776b6fb00e93fb8e +size 708990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b7c552f97..69ef29ec4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8acd2cdf8050902f01f7d705ffe215b7063736b93a0915f0555d9badc2acd6e0 -size 727159 +oid sha256:88a9384be4c7cb6bf35ed8eea80a89ae5ea198728697850a8db1b3efb5bfdbd4 +size 676774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index de426966e1..767cc38a3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d61520cab5a8abcfab46d503ea978e5c287f1e907f2fb2746ddc7ed8eb1ccb4 -size 773373 +oid sha256:c4fdefe8fdee639571f6f94f071cdae7465cd9b4e8cb46ceae29853cc30412ad +size 719288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0eb846c0c8..ea9c4b4623 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:ce3aa93b6f7c1791a0d00d29083c5557262d04bb6f11566366dccaa94fbcaac6 -size 665127 +oid sha256:265ced5e3396651e3602abc2dacc0b2c3e8c7baf21bfc28be4f3645d697e79e2 +size 625496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 84da443582..42c4683f22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7ea77b375d604a920eb166dce4eeb49507833d12eceb6743054f67447db48c2 -size 706623 +oid sha256:a89b9f2647b19b8ae0fca0cb2699c316945a7fb0a098d7c4741c42274314eb78 +size 670198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 76d27bb5ee..fd8e5fbaab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b620eb9b27ccd04e8477c21cd5803f85f2cdde2b57743ec5bb492521db52104e -size 626547 +oid sha256:b8d2c9a264e593897c05c6abf986e8749d3247b923db2fb2345b6c07cae081b6 +size 589431 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a23396fa7e..f12f1e2586 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:252ea5c3045f6fb1c85e3e6094c6910558e9dc2d199c15cd3d9d4a86832831d1 -size 758741 +oid sha256:a84cca777cd487554cb0c02e057b5b0046f7f3beedd4696284d5ff1c65e8bcd3 +size 712252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 04087c831a..bf376673d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b05dfb2f63e58f1867f632f370002631713b2425c9af0bd117e7986119286c74 -size 716953 +oid sha256:9513be4c0b6728cc113ed1692398985aba5888b367b63adda3670e5986f33368 +size 672734 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2cf25f2245..953771879e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44ca7a5b8b64a023ea48b710eb5d2af87bad7a2dca766bde5d68aa70d28e9521 -size 745069 +oid sha256:d5da4c8484644ee09c10aeab980635368147e6e3c4c4b2153e0d7a1ee891dfc3 +size 695474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d764457a80..04bd097f4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78876a6785f77590b54fb0e104aaf9e84994e8a18c2575563b5481f6769bb023 -size 707821 +oid sha256:990b88621134b4208cfa58b3e14f2e0f8966fdc05fb5988031a6821c25bf1ea7 +size 663256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8d6b62b15f..26f596b19e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e0d833d9c801b2156042495ef23ddc672d8eef71a182a715ae70625535acfdb -size 753935 +oid sha256:f9c55be2b1e31aefab895f20b9494bbc7899887dd82bf60b4f5dc3cc7d8f89ca +size 705770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2e89974ab7..d4d2d9c6a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:894615bc446dd7ea93a94899fc9a4f6db389ae3ba988608f825db54aed1dcbba -size 645789 +oid sha256:cc0656913671b10af03b193ed3f028192c3c68ac55a0c487cf1f3fd00cd0abc7 +size 611979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 973879dbd0..4bf93c47a8 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d4abb30828c9f81c3cf9aee737878d17fcb018a4a067ffc52924e5db6d5cb73 -size 687037 +oid sha256:39e74cd0a3ad2444ac618d78c70cdb765a59c123c624203f4bb036b730c47d30 +size 656780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ae24b3698f..c9731d8ff9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40ea14bca6fffb6285d30729c985df8dc2fe1732860666d5aa6802e76eb6cd77 -size 607948 +oid sha256:065117bdcbd44b1598379ec30e49e8e803edf41fefbb49674ff8914ba706ddb8 +size 575173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0f5ab35c04..646e8a4752 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8afa7f34b4f1bc989a4b0ca4de7fc2d75f7fc32d1a543bb91d7fd861a4c4283a -size 797255 +oid sha256:5e486f4f227b29bf9b243cbd78acf67588477b575f4b85a93a17d509cb6703db +size 745390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce7af23ed7..971aada071 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51213d1fbe627a56ff524036443b11df9b32d4684cb2a193dbed56e0c73cc927 -size 753939 +oid sha256:46f9f362522302e57cd145e028d99be9b0eb7c3da6ed54f49b0da44d519299de +size 705822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 128dc8a00a..16de1734ff 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6f61bc6dc17685088aef35d64b21ddffe14bd7e65ac1fb031d9f9cedcf3ecd4 -size 782843 +oid sha256:aa152ca7f25b5b11689bd55b8aff352053f8b7f630f7ddc3e5ba47dbcf28f76c +size 728610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee87dbef33..2a70a10db0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32821461a1ca6b9e872d32c1f16f1fbe4742b409e4727a8f122c94b42b6abb4a -size 744805 +oid sha256:7cad5198568015bdbbc7bd50dcb9c9330410f9d928bd0f08c13ca17f92aceac6 +size 695506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b1a43df38a..34eeb054c9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c44aff38eddd344e0bc3d51ef911fe2a00d42afd76f5ef515df9e10fbe2280c2 -size 792203 +oid sha256:3f865e3378ef7721d54d79a13ef0790fe48c96851598c774a636f0e6de89c4b9 +size 736440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 10a2730b30..464c90f1d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d6627fe8ac665f0ba7bfbd3e55d1cc0bd5bb8706f666293e53222171bc7b43f -size 674189 +oid sha256:d9289d16c33176b765504f02509d671795a6ec8ecc71e3360737304661b0fcc5 +size 632930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f643b14525..36c78b20d1 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6243a73f5050524651bcb2336dcfeea896dd58f1cc0f4ee039f0653c6fa0c7e8 -size 743607 +oid sha256:ae797c2aa54dc94c6a58c9fe634f1aee1267b5cf46881fb6b87ed57c7631a796 +size 695540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e426b16a0b..b7dab0b4c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87178a6a45852c0e68d29f8812743739367cde322cc338cbed9dbc4cf6452db2 -size 634819 +oid sha256:51f2f062c3738daee55c1d519cd79d297c1ec8bbd2b928d88ed6239ad2abad6f +size 597703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5ad7611a5e..f35ba6d822 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9a6c55bad24fe81417b7ad58d7d1030b60d64c1c09df3ce5aa5a21ff6b44ff1 -size 777917 +oid sha256:ac75c0bc5b4d1afbe28b2a7155b37d3b4602fad22eee077c80a58ffbdaf4d7ad +size 731922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index da9cbad0ce..ce2d782b02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06c9a6a6bf4e161a3605b9a905f8a10cb52665e0a920898720507a6566cecde6 -size 733761 +oid sha256:b15e57ee319fe4a84df2652274bcb76b75d6837e369c6c0bf3cf6f5f0c650597 +size 692304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 442fc94b53..6d40159922 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f695541394270ea6f63d5be2a663eec588df664dd0d61a935a7c73665a1e3a1e -size 763455 +oid sha256:eb8ffe299f0468507e537a32684411a8c9a99e32145ad27359176a1cb872079c +size 714304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1f03e6227c..e1e0672022 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20c0730b6ca6cbf9f83e3d907acc9dc4d39aa7b6512e2abd1597a35f4014cab8 -size 725417 +oid sha256:72ddc5ff37caebed3c6df5a1d85b2cdf365952eab3318a5d7c3b76ab6f322ba8 +size 682038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f33689e9b9..86cabbb503 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1c9bf82c5c7bb3949bc4bc11b7746bd86faa47c743d81bc6fe853bff53ae2bd -size 771977 +oid sha256:952789bc742d56985766c79db720ec6fc52bf656c7832636f37bb5a56fce101d +size 722972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8495cba32f..3bd60c5bd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14668e8fc8c723316f70ea782810fe7e297611a6a402a800e5b7115b0de096b6 -size 654851 +oid sha256:d76824c69893d6c559acdac4a60fb5ca079b9948e6be42ae91abb0b4b1c5c033 +size 619462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9a64b20cba..0558bba134 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daac426adcc8393a2989de79446bf37ec51efb29e14fc61cb1cc41921c7ae4cb -size 724171 +oid sha256:2ae6181fb779635f8a4e773bf010f6d8aa1fd41b1c239cc50231f4caab92f2bd +size 682024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fd5cfe280f..b0dc4cc54f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c725b82de1ce3fd12f9fcdd1babd608611da73df9805314ae3ac863b03f0cce9 -size 616220 +oid sha256:1d79b09a3322ef05da39eac085da3aecc05e5a2c4652cdd315e0b1bda880e50d +size 584235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7a05b2dc77..beebbb3ff3 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61a40febe1441917d34d186229cb895815b8dee9416510d78539f9e7e1142674 -size 740785 +oid sha256:baf379c9d1ace4ab2d900785e2dcfdafa50a9aecc4112f671526b0856e2a3490 +size 689166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a3f5c4e2d9..52d22f3b2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e219e2210271119e2e51049824f1a1b3cc2ba3cbe21afb97970900b754f78efe -size 721243 +oid sha256:51e459e0de007566db592de77d52a431b1ef3d75d09bbca621f9d3c31076d5ed +size 672092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index f1d982135c..d628334015 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:507226843f18a95d4893dc8177411d365947f73471910ce722ad226c441f2152 -size 694097 +oid sha256:819d3da3d213fd5670e3e3b06c802a6c60182d55646f90455cfa482bc06f7a49 +size 663494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 145a1f5803..be2bb74b61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42c12ed82b8ca6a74ba23ac7e8bdcb8084d4573789f9c7bf56ead0cf77439b43 -size 607310 +oid sha256:fa224078324bcfb0717980f25175b8a2856ab90606db5de70eb87d1a22d977a5 +size 578681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index adfb83d835..694bcc34a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ea35a45a4ac2acfacb4aa9d8e205c39ee75f51f768761804175565389ac9a4c -size 731117 +oid sha256:b6f5ef3948e7b9081e554ca7f67ec4c09f524602da4cb6ee2a5cfcde508460af +size 682014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d92e865264..84c46dce55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f0dc940f61c863340a36866b8ac154071452f3eaef99d9a36b34655af83830b -size 711575 +oid sha256:bf9e95a01009a2367d51843984aa6a4ecdce9e59171c1cc0421303e71d3050bd +size 664988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 5445eb71a4..df9f74d3f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fd56be561db7d48f4b95f2bd667015081ceab9e92412e4dc387d3945aa6b29c -size 684329 +oid sha256:c68a2630f7f91af30364631c8bbc6285f2a4d805f3aec1dcced8d5c4af572f41 +size 656340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3cd81269b6..ed18770279 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b9c5a49c57b2d1d38b4194cb87f21dc8ef6f1f08df8ec75c4074abc849be3fb -size 597592 +oid sha256:24c9fe502d4eafe8e4b8d1a95672fd1245cb1d2199e64d8bd323062435b42f60 +size 571527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 01591731dc..aeab299f42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ad49680f1908e85455e1e58b1c098956f9ac2227dbf67c1509ee2a6a4fe5618 -size 640241 +oid 
sha256:8e21acdf8dd9992fe1214ac5c337cb667783188990b2d0763c5c78f26a1224b9 +size 606973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5954a3f814..9f99183ba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9272dd514be222420dd1797eaa73976bd15bab9f64194190661383e7d2d4b5ec -size 615078 +oid sha256:871e0e16f0b767ba176ff66ab0f05535957367d6e3c4f3f99a7996e281838572 +size 583685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6385e7a974..1d239fba72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d847de715a9dec9d3e331d5e655556e2941494c249052ed1a46a64024db433b -size 633969 +oid sha256:d7c6668a9e4fe1c09dbd85198c14a872532ad45c775f844d3d834152e7873e46 +size 599419 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7d253e3c7d..2ff517dffa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15d514aab9de663ed99c84cbae4fdb330c9afeb8ea02b137ebb21d70a5371a5c -size 608806 +oid sha256:f968ecee7d3cf177a272fdf898e00f2b296cfcd7aaf67ca09ab06c4662eaae79 +size 577365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bbcd856f6f..6f94f8a713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4b0a79fc39bcd349cacdb47c77b56cfe725632a290dd6b2f8ecd8fdbc60388a -size 627985 +oid sha256:16022d01808a8f3e16b6c84ef4f9efdbafb023af09e084bf16482b324c1eead0 +size 604091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c7e4152e16..129a682da5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21c1e5376d227b62f41b1bf30612e720b54826c1be92fd560f06e6ff9819eba8 -size 546972 +oid sha256:59d4e910434eab269dce8180381443f789beafc2d3ca585733519a648b3da169 +size 521351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dfc08720be..29ad2e22d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a328d43e959aebdca86f1eb215fa7463bacfc8fbbe0463245441a292b7bf347f -size 599270 +oid sha256:3e01e64cf8bf1e083c729166995e3deb93ba0b437fb2cde1d1ee71c87350fffc +size 578485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1dffdf744d..cb43b93474 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5121086bb4cbf5f4966ac470c8bbb8b1c1c912844fcca2f34b1c4c6c02c5d471 -size 520478 +oid sha256:bb029bc118f9d92e6a1fe171e902d3625815228e0d4dfd54a75fd186bcf029be +size 495893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d9ebee924b..86b0b792e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23b6afff3d8f187d5d6bb2f4869251af933e00f4b7411b2fe7e6a0078b3e715e -size 630521 +oid sha256:10390c410b8a93f9c633d93ab9f94c1c644dbd6851387bd1225e8717c18a62bc +size 599819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5ee5d18176..85cc457571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f96ff77213ea0f129cd2d9e9f088d915042b94a77bae6681e41ea688f5aab219 -size 605408 +oid sha256:dbbd429a9d77dd1882ef250eaa5bf9682d89d42d7c87a075e2d5c0abfb304f1e +size 576531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 52845594e3..0c58794288 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be905bcdc9aa047452289f7c6c05334d8f8d3de7639ba15c112b413b8ddbaf0c -size 624251 +oid sha256:eaf160690f65ffc20eb6a142eac1a962cf863638b32a737d796c0cade55b923d +size 592265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c5c58209e3..225f33d4a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09824a0a5a2d4a071c821cac9189a14a80b2a880f0bb7f6ee4a75fe69f806892 -size 599136 +oid sha256:42d772de21d11c64dd88541d646726886d976519267cc5b145dba3638d8ac01d +size 570211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 40d6ce9060..5b182438f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0361bf319ad760f72aa5dbe92f25637c05589c29d8786ef120cfbcf9d47473b6 -size 618169 +oid sha256:ee1dbbd138a5bb0b919a3b4c0ff7a13a674818c9d8f2ea171dab436ac6da43fc +size 596149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e26a3ce4ce..7a7ce07366 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af7c6cf9a5c185265e040d77623ad1a0491ec2b2286d173178fc730e4c466332 -size 537302 +oid sha256:2ae7326a7e86df31c220519d9bb9d90a87abb3cf2b2cbd1b0547a8a795d63588 
+size 513409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8d78da200c..0636c86e01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b86d727065afa48b4f893df23becc7d8c3ca1eeff11dfb947a295f5683b58d7a -size 589650 +oid sha256:0090882de350826b49ff0b4d40f1403dcfbd631472f0956b32bf68e9360577ee +size 571431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c7d094bf8c..6288cd7349 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:599a3c44e850078573f0b6eafbc3828e46a228f832763f865ccd99e8f28253b6 -size 510808 +oid sha256:0f3a8374aef244bca8f792e7a2a309cfdc643ac2d239c0eb0bf58e793113838e +size 488789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b4f6e2db23..4fc5d4f760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa12e6fef9dd65d050c21a0a49b1b86e1ee983451d13a4caaf35ace5109ff4e8 -size 658627 +oid sha256:d6bfce5f8f4f30a86a27e0e4c9e10dc644ee476fb30c9aaf2ea47c396c2a368c +size 627234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e6eb46b26..af4670c8ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca9d848b8e2b40d0bd20d1979fa022ffa6d91b7a611e8b1f277459898ef32cfa -size 631885 +oid sha256:97ed6d94e45d864933661bfb357e8e45412e800567e83f0dbd86e88518447d5c +size 591809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1a3cb6ffa7..36cf05b8dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4f275bfb152e619b9e9c20917621cb6b9de1df70b8e6b675213892125887802 -size 651565 +oid sha256:227598592c52b2f4e2cfc1cdce5a494487e25487df0044fa459fb2548b285feb +size 618102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ddad703d53..6637966fd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc21ac169afdd19293c0a9131c7e9a786624c00fa52245b67f8049eef6dc5695 -size 625615 +oid sha256:b7bfd4886f8fa6d90555bdd3c615f4dc6a4673c52a00350f6716ba3f270bb720 +size 595257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5d0fdf6353..894b00ec46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47a28d55893896c5be37411cb807d2dce0ab5d77fef03f557e69d094e3218f40 -size 662751 +oid sha256:febaa7562ec03da197e196ad160b457b017674e52e48f95f09b72881af8462bd +size 629928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2cd10b41f2..1ddb2d8e04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:854a1725b94c8cc21a2574a7d7b6e12949db4679a7084ae565003717231dfbe5 -size 558254 +oid sha256:9cba94192fea3e549576a2f10132c4a5d09ec1ae005e3e17ed100d3f0b995fc3 +size 533127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 50fb056835..46efdeee3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4d268bbe5f6d102a76326a38c02256d320ed9610ea3ba48261cea08d3a85467 -size 628561 +oid sha256:29c0568e5d3d0aacadece4d686b04b0d6013f759b3b66d8ad1ba638dbbc408b0 +size 601163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 22739fb14a..bd8a1ca6f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bd5e0413b07761f14cad5af4dd429e6464a194fcb4edd37ddf3f33d7c64b7fc -size 530180 +oid sha256:3fbfbc2b2af1a77b261e51ebd8f0f74031d69183961a3813aab3bed309b4bd4d +size 508457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5d6c54df0a..f47592bbc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dafec6d927df7742516a66d22c61b7dfb554167d170b51fdaf15233935dc2dfb -size 648907 +oid sha256:af72c09ed36c5e16ff1ada2dd701b7539bda1916a7279e9d12a21d84a94bbc2a +size 620080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a895bf49ed..dd2aec9dee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcfd64dec15c7d4b56f01fc2c194aa818b1abf2a87d69f0afdeec4441a7f47d1 -size 622217 +oid sha256:4e75b1d597aa47f4a403f835c0f62eb5b218dbabfd7e6703d923ddfbd4b85a24 +size 584657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5f78d09c70..dfba971249 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bca479ad6f27116d5a4b6547cb4ec715154bfd349564a00bcff5caf8074f733 -size 641847 +oid sha256:14b48d0aca05f8c9eed293c3e64214f13708f1e9a21ce7660fe9dea7e3874be9 +size 610947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5a5a24c86..37ed18e3c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a40f373c436a7fecc496aa74911106561cb1e9161873bd414cfe625761c6f7b -size 615944 +oid sha256:a805c4014d372b9401b5de7337eab32f63535e2d26e660e6118cb2c47ecdd7b1 +size 588103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2cf244342b..f810f849e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90438744ffe4e5014e8febe9f6d01b0b018d151b9a73b9da8c1d7ae5b6f17736 -size 653031 +oid sha256:ac7007edf3ecf789106278776a38790d312eabff1cc82a42e132318c76ebe900 +size 622774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8d59477bd4..352e482282 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12a780e438d3e3ea9b9f0902a7d0abc6ce4d7b4d250397fca8c764e3357a4905 -size 548584 +oid sha256:1dd3eac692d4ec422767a04fb0716a0f21489020a93418724df0db3f031b499f +size 525973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a79840347b..a1cead46e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25c3002fdf77ef51386ba8fb6dfc58894b89744a6d197e09d6842f83a85962cc -size 618841 +oid sha256:0cc880310d5c632969228baf591b094fc5df505c3b6398453170a03734a623f9 +size 593221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 379fb9f767..7efb665995 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:955c04e1d661f6db2221d7629d8a8fe028fa29473f0fa48bfc563c2dd79365a8 -size 520512 +oid sha256:b4d780669e215954f74178e8c9f491871eff44c18f2dc011735aacee2729f168 +size 501305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index bf3f0d3cef..47d64aa73a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d20f35ad9ac542b582ce03f038bd289f562e38c958376f055b7fa7aac56d3ebd -size 840293 +oid sha256:606f685d736a06bdd31582e713700fc83fdc4f1e1ede4175232c8e70a617c407 +size 794840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7116038ddb..77dad2662f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a10311082af84a4d0e63024024c1f0aef84fcd9fb710000255f23157b714e02 -size 826425 +oid sha256:5cb7e6870c8c0598ed3c7a1765b357ba2280829ab79120be9024220b6b449268 +size 798090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b29de9fa65..223f7b61d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4521674a36fdbb35747d96debacb277c8393b668d2c98253ad2e9aa29efc33d5 -size 790299 +oid sha256:57b1a1f07a529659dbcd38fcec44593e546ce57d675ba5d120d245160013008c +size 772424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4610e66ca1..11af8e78af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:851b0b8bc2e8654baeba7afe76cfdc89deed21d3480254d677a58d5b66c7148f -size 697791 +oid sha256:08a15482b017801734801c968c48d304790f8490fb3bc4aa491d457c688618fd +size 684898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7e7dd85182..dc04e6038f 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fd61f4e99eebd3148d9366d272b076f5529c4a63af526590f8dc5cd2180f467 -size 817945 +oid sha256:7130ac3d586e099ef9b1e3db3bb59a1d917be0a102a10ded64d2c0e3e52277bf +size 780584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 198a99fb76..84f8212f4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fbe6cfad9bbe0eaf02da47f21bd221be155c856314d372a6539b0e33d5f4579 -size 806099 +oid sha256:c1b0d494412b506b2dbb2f076d9de3d563b8bf683d77b29f33fd9aee3754f24e +size 783834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 9bd4ae57bd..8e25d08d67 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00ffd59da8673003e8af119128dda5a7b75d35cbd253b36c00153ed9ea944b55 -size 768247 +oid sha256:0ca7c88c87f21ce7d38e3eeb8aca8a43a75e7706125f2edfda516a9351414a60 +size 752690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8f9cb4ebf5..3d54329823 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8a5cec5f336bca75470c096e34639ca585962dbbd8d04ac0d71aff1185d1cf1 -size 675443 +oid sha256:04914e3cf7f1e65788eb78c53782b1e67cd4e2a898d664561535363b4bdabad6 +size 671380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bfec09c3b3..e958871314 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c4ea89f6941203161dceb4f4a978d57491b4cb3162f7a31c7486884f4edae6e -size 717745 +oid sha256:626061b02b6a7d1710ee6feb026ac58287fea596fbdf89552cf769f93f822a03 +size 664054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 00a311fe62..3f043e3693 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed296870fc05e8b5dac689267d7e4b6d2cc279295a0fbacb16961c8e7cbc92d0 -size 668409 +oid sha256:d41183f5104da3c7b87561576a368c1eb47d2ab5b47188933436e0d4b3f66694 +size 619702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ebf86d806..7f5ebfe560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4171e7e80a38331a5f4dbc614881af57c40b9a0ba8eb317fc3d6f0470d5e2eaf -size 707133 +oid sha256:99efc5b0602c34f722908cefc5426df97d447837dc379326ec3e4dfe90a4d7e5 +size 652208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0087fbc19c..d295d4446e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75150b1f487d4ea3ba05b5f80cf9eddb988ce8aa5f41c4ab3d8110ac44b7bf07 -size 660263 +oid sha256:035655df2bbbb11d858a24c0469d49c0d9aebd03e4d9e7fcb23c4071387eb711 +size 614267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9e732e7fa8..57ba8bff35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ 
version https://git-lfs.github.com/spec/v1 -oid sha256:1995fc9e0cd8e8e569f3ec362ed12e2de1698f4fb2390bb65ceaa035e02f2b42 -size 712495 +oid sha256:40c9240df5ac8a51330b6e58f6e426c5b01da23cb881523685f5276d0ec24e88 +size 659496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f209b1f67f..c98405ff87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b490823f58aadbd0e01c7e7e53817d206f6fbe47f9f1065b823918d4dd96fdcf -size 608294 +oid sha256:5300cf7738deb8f5645a65cf375e65f49eaf774354fe3ce1dc048d0c8d77f61f +size 564667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b864dd4a1b..04938928a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ca8485259025f43caea53ba805391c4098641a436add05d8abf497aba568900 -size 648261 +oid sha256:a8ab798d2a2c25d3c9f3b77f185b61c7429f55d1f29cda17939da308c40bdf68 +size 611837 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8d31e69d6a..efb873b64f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2dff9d8505939e6abb3f67a3c7efee0906c39df05c931156ee91d6cffe13070 -size 571046 +oid sha256:c434160dd1ea0f8d44d41e7e6dc65532c7f373c1e103de4b84c8860bdf70173d +size 530429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 523f20ea19..b29a9cc43d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2806fd6f2a072730df9342f926b8b9bbf7aebad1489762728b0c81df56ea2ac9 -size 693917 +oid sha256:20496d92af02f56fae6442bc49879073c42a37f7c91c89782d58af1ab3f8998a +size 649796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 350109443a..10a06328cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3cfad2bfce66d3504b81254aecdffc4b68d6491ea46bd40abdc18663c3ea0929 -size 644581 +oid sha256:fef1b6e12149c86896c45b8408b65eb9f41271c719177afd3d0d51224f0e1f1d +size 605393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4aef8777fa..55dd46a525 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6bf8c2cbed339ea64eb245f4f702a316813b51f8ab63091e27ee89029a76e96 -size 685179 +oid sha256:5e5e4bd6d8141540ee9ae01ce8180030bf523be40e46c114b87714546cc92f2f +size 637902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d54a6ef601..3b9e677287 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c9ca81603df42c4fa87883c78ebd193f82d18d6635e16e2f6ddc9c6eb9cc25c -size 638311 +oid sha256:ffd00eacb4fc0cd2d74461e412e87fce2f86d4dea3595d592ffa131640711110 +size 599961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 224a38d383..cd582a43fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72df199c25a0975799ec8e9605c44b3182e5261272ff0a8297f758a32dda7100 -size 688519 +oid sha256:44d0d3bf5c444c275e820cc002363c91956614f11c4974a6329134e0ceb94f94 +size 643610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5828e61d1c..d9c320c19a 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24d703e3479a708c4f4038ab22ddc055deb1cacfff008c60ac54042543a8f635 -size 586934 +oid sha256:0bd6880ff6fa5bff3a2d496f699a1994b004e37b962dfc30d0852ecdba812402 +size 551151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1d568414c4..f31d928846 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b6475574bd3e24b7045f3508686266f78431d7cc1488fa02c4a0036e28ea907 -size 628825 +oid sha256:78de4b9711fa99c1a7d13347a19b6be419e0ca487e5f592cde83e36eb1f3d952 +size 597431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a1c7ebb50..9b42bc67ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bb5a3049bb2d0f1db359f63829ee730c81ffa9088ec66a36c2a0c84cb25a438 -size 549784 +oid sha256:d17a3737fa72fced2405075fead338240b1d323d536b67d053fbf05f8488168a +size 516961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 004f7280fc..7fb42e018a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e420ae9f6d5b2e4cd8e79a362e7f548689c054288bded9926e90a17f086d5986 -size 735341 +oid sha256:60b69fb9cbaafe0d876af19d05eef311fd92664d01743150779dae7c9bd1029a +size 683970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1554d16b19..10e4cd94de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59b421d332cca18ab7e8740d7c74f04e4e8987725207319bd55d7ddc8863c23b -size 685217 +oid sha256:c23ad54695b86ee7db99b4612d1a28c04de42dd0806e6098733289b2a5c2e855 +size 626444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e34694f001..8b45acfc12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db81e3db1eb458ccc261131ed0cf23b52894a41b749075ba7e5a56db420b163d -size 724729 +oid sha256:89b7004bfddb8bef16bebd3f836f42c434d5002b4ab0900460cd1e806177354d +size 672074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e86f922313..783dee8035 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c65a19d17de4c6d0faaaf1eeeba584af9d186ae35134c240d9921f4f04b86082 -size 677861 +oid sha256:7d52ed0cd252532762d564503f9ad1223e878db3a0311689f2198ac1cefb46c1 +size 632358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eecc93e36e..b184a86aa8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dad7a5d76a5680aacd0a774825038d9b91a7401d16c8affb0810a9f6876c0e55 -size 731325 +oid sha256:704b3ae3286d9cec3dacc87a954f7f6ae326ce0a707dac11fe5c74df780117de +size 676500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 890169cd9b..b3ba3ff243 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f741dfbd42b507e2da27da503bf7d6b135cd3dd33d0d5313b852faa41614f6a -size 608280 +oid sha256:8f25d56fe17d3e8b8a93b0c9621579969bcaf49878afc47d5e9dd0c5d5eb8d95 +size 570671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 73e315c7cc..a355a91929 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b39e0665c230267b3b2da9211c378d65e0349fa9bbdcd6eaf72519af7096f3e2 -size 688799 +oid sha256:ecd09a761a3fdc14577c1c7021cb48c99f876730f27c4a9a9333c5cd115fc3ff +size 638166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 185f1fb78c..d18b993156 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f75e03eb601d62ba2365ab39e30c25e8e76001fee3b650cffce2f3c67deed000 -size 570438 +oid sha256:4edd6630deebd5b39305f490b69e28ae2811a91929e6597653fa911b4e79e9cf +size 536579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8846e9f9e7..a62a82e5e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c038eb0192acb5b6e5ffc08ef5611d58548e5ed4a31728ecd5aac2d1ed58e94e -size 712303 +oid sha256:77aa66ab3839bfbb43841835011b9ae356271742234909f190d8a22975e7a329 +size 669662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 05231ac0c5..29b1b4ff8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f1745d310b2df730824d42ef8d7085aeb8ffbc8b168a164ad687157bba74ab0 -size 661389 +oid sha256:44b35e497c97ca0caf819dc9d84acb555dfca4cb34f6eeee932206ccdc49fe2b +size 612137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 03b6b3ad50..69b3a0620e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:686b2955b9cbd0a2ec784ece262ca8efa6f2a22990b5b3f4b98fb11c7350bced -size 702775 +oid sha256:07ea519efbbb7397b21e6adabce27b9dba9cb480bd1906181e82349b7fe148bf +size 657816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8318e8db46..274d6cff5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c77dda7a7126c2d238b164ed29c7f640cab09d8b6590d82e6685513516da7db -size 655907 +oid sha256:c091aab125d5380a5e394119db78ddc293fff0f209441f8bd9e29b0d6cb299b2 +size 618842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d9a429a9e6..7f11fdc4a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68ccdfb98f68ac96d759a385c010a8d7b5db29d1bd412eeab0c92d568002ee4c -size 706461 +oid sha256:443de1026afad9f9714047f88ed0ba8c7a0e097ac4bcc484a4f6e79b09c613a1 +size 661404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 22cffa11c2..db566d609a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e79506e0fb547c023e92a402765fa343a95276bd4348ed3a2356eed516e8a61 -size 589680 +oid sha256:bffa58854742b4d84f64b4f530fe26984549779f60d7e15206ec6c787dbe7fbd +size 556365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 71fb0eebaf..e4b55fcb07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85da87c6b35ab9020044e1db0902f6d0fb96ba592cc63f17123677629f64933a -size 663983 +oid sha256:c6b2ea04d7aa17c7f9c83ebf7c8503a60f769941b84cfeac2d7f5c44b8f7fc63 +size 622972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d26ac395c..7407ea4259 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a575f61ad9adec24813fb289fea5fb6c906c8a549e643b089b2252294454ea2d -size 551890 +oid sha256:00487c110e8b3e757ebed29ed8638cc9f546e5ba608956ab06ac498eb6996ea8 +size 522323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c0f28f728d..8ce660d03f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a51975a58b59c85c994747724d8acbec6b48cf514fc56bee11ccc4b8ddf3263 -size 886663 +oid sha256:e5535b93aaf7cb1dc2aee14f5e47d20f45b6d04005090d26668bfdae6bba737c +size 803026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 460dc2be00..dfaf715b52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f3b42af7211aecbe74c894f9633726ef11e217efd84ed74b8a0aee4f6506108 -size 843197 +oid sha256:a514a55467f87808d7205ea6c7ac77ad711d56c7aed7f144344266bd5e0d383a +size 762718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4b282884e1..bf25f3885c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eca4dba678057806029f12e0234475cb0fef104f62d8b3d5f96dc298cd447d2e -size 872201 +oid sha256:820817c5c766e8bab706dc5859f304c173ddc1d997b09a0f0b1b25eefb07f392 +size 786246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aaf734d787..b2320df94d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4a878ad4aa782847708e301fbc05412c03c5165b05c372542e8e7eca9080f0b -size 832487 +oid sha256:a0ab27b58665f195ebb6ba52d1bf712ef5f87db8e7b5cd4611ba6574c73721b6 +size 752402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9302725e97..4e699e07a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2558728a0d57c4a7794285133cff2741966308414b31d9a6f1e59a376843e3b2 -size 863555 +oid sha256:8224be62ce1a873bdc55ed10699db2cd11bb4cea61d7df4342d8fda71811d5a1 +size 775922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86e761c24a..87fa721e49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 
-oid sha256:d8021ef58077360ce8129b74ccadb4099d0477857249e1346423a3066db3ad25 -size 775339 +oid sha256:269b765cb9003a2f79ef74bf4ccbf583b44219ae69f8bcf884eadcb433698d7c +size 700878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a323155445..587bc815c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d3439d6b9149b3c95753062b3189ecd9b9082a05506f43ed0c359079ea6fd14 -size 794387 +oid sha256:a23161afc4ccc5742497b5b80cdd91ae3ad7f85a3a1899dd475a2cb3f190dc6b +size 724070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 487f12fa65..cd8ed13e7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ab742c03f42c3f85ec4035aebcf0c22d2688a67f2a2c1ff417673f5721e4685 -size 735771 +oid sha256:a1195b2db67b49d8f1a0e5c3346d65e2e9dde051ed3b8147cee784ffb5d1738b +size 663826 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f726aef88d..d30c543218 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:267deab746e9f7e165b57f88a45e211f50cf0eda3e00fd06e5bf753de396c14a -size 841177 +oid sha256:fb46da3d080d7f953c43498035daf30b485f75d7e85243076b7ef689d217ca3e +size 776040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf04903217..26f9570695 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faba40112f59a121ffccc9980f3a4fef85c4e0942d3353d612aa0e7ddaa87622 -size 796923 +oid sha256:902f08ac29949b88352ba78f31aac2c56b8242030ef068e0fafb0a8a5e3fafe4 +size 735684 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 16dba1f987..e0d9ecaf30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8abc50d79ce371dbdfd61b2cd6b55a0edb2f9109b1225730619eae0514f56f2 -size 827555 +oid sha256:563c90b92413dcb25722a6d4f854ec44acb883530ccd57b97d488988979d14ad +size 759262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dc05bd1e57..710353fd2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5927ac97240e4b957f2f80f1be4da0093d411098d674806ae8788436da33f80d -size 787001 +oid sha256:a7a73b533725746dacd0ff671c4590522a8a377c3e2c926d0e790be6fedb4acf +size 725416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 06d9b048d1..eb7921974d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:754219916f4e445d934b23113c4c06c85994323885350a38b06cd94597313265 -size 816491 +oid sha256:6021fe2fdbf4eb0a311488b2c3cdf212138ec301f99a134194f888ce36947d65 +size 748148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d44e45e7c..2d374b837e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a908b18a3ff15b02476876f3e6f4926f9d1be524f0798b9b8464d5998f8e6d0 -size 733455 +oid sha256:5387e2cd68fccb375b50180395dfb77c17dd2515e094eebcf93bc05d3eefdaf0 +size 673844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e9529be362..a052d5c28c 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92fe38f4b407dcfb4936c6173bba89cd6064f65f68be425244d2708381fe7e97 -size 750777 +oid sha256:bc30637d7f132dc91b0bf413e308278d95c3709e4167776e01c8d48be7dbf443 +size 696346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2d1f282e81..8c5ba350c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a86ef598ada2630e2ed00ef2c078162edb31a52a4e6849fb8e325849f0126571 -size 693097 +oid sha256:b82364167f9dc42a5d7ab18dd70e8d50b6c77b9303e3cc100b52507f20bb9436 +size 636842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d78bf2c244..d39bb8e190 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7647dc38acc8c80740e9660b3e312ece4fb69eb0b68f1f0492cf3561a448f1ab -size 907071 +oid sha256:0f524b77167c40f57d39313cc752125b50c47464067efeee65e1f962bc1f73db +size 823582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f58404f48b..33d632609b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b34ea6c4a856866b9e1c953a4f6bc8ced756a20c8b256ea45ed0edbec82990cc -size 861979 +oid sha256:6fc18f480c85c9f414db86d6a07a4c99ea1055f9d644b21bb736099b7d57d4e8 +size 782240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7845314d3d..a577a9da03 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce98c7a129b45717f1d46cc52bcd555995206888645415cca2b4680bf6996351 -size 890933 +oid sha256:429ea9e0eee4a96ad6aab42a13b633cc2fde1ec7202655e86d73e2573715c1bd +size 806014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cbf96209b7..1fc8ddbdad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16d687d914a648ac107db4e094d051e2f2b002a08d34aac2b73e9a9f4c5e7fcf -size 851169 +oid sha256:acd25fbfda4686b07e834c73d15353ae150395756de1bb51e761f0ecf9ec58bb +size 771182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 05a27a397e..5f4cad5de7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78747fbe7392f7a5a060e48ad0bb80cf34a99748d1b0c9af0b3eec28d5a54ba4 -size 879917 +oid sha256:6dbede98252b400f8351425edeb2d443617aca01f4e64240dbca2e4d209b72de +size 793518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9b33b03a8b..f27ca5a918 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98221acfd17931e0a55a9a5ad875d33885214972344d2fc9816a38b6c4f023af -size 752531 +oid sha256:f7ae52c720dbbca6e99481267234669e30b7dc4d601795b84ff782140dcd53ee +size 685520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1ef48e5d07..f20f39c4a0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d5ed8220701313a90617da0cf00f046c1a6fce7c8754cb1a9c12818284091a9 -size 830483 +oid sha256:d63b4ca5272a9d8f0f8023bdcbf49fef4e1298a3c8150e07fbbc7351fb177d45 +size 751978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index df9c355f0c..e3296acf07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a718cc6658412c980aa26b0e69807b2fe12906156e449399d02b3b34cb11207 -size 710695 +oid sha256:bdc43dfabff702899830663561bbab9a5f0826e3a517739d548fb5a6e9d3c364 +size 647038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 48c8f889cd..5773ca1e23 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33901e920145d697261f393e6a7677d4b17bb3c9371a94510d1d7241e445e0d9 -size 859613 +oid sha256:052214f4669e9a8228ecaeb691c8e897d8be0af3cc24e203447aa9b61ae02a88 +size 795808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b5f7d40500..329317d006 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d48259b792792a827065fa3924b44bda5e6ad1cfd6db9d958d3ade9723c288b -size 814519 +oid sha256:a5877eb1653c44e6461a260b3c9048879054453fb1b6795feb78449565d7ff43 +size 754464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0992900e82..8f8288bbca 100644 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffdc79791c0829748ee7170f7829713922b9df33736909833736a342e29478f0 -size 845941 +oid sha256:7a1d489851d3dd6bacd27a392e664c8742abbdcd6a044a1c2c986676ce703472 +size 779028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7c1c51784d..f4cc77fd8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5058eecc0c2b0bfbda6dd1aff477ec0e859a6b3643323635926ef90d5f4b72c3 -size 805387 +oid sha256:9ec59541942ccf5d750354249695a23188fdcd717ba613f3cfbd8e90456f8d09 +size 743358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 076181efd3..e3fed5b2f4 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3bdad8fa11add5d6664080231a07d7f25895dafe463b03b4b98fab77c554024 -size 834729 +oid sha256:07017b1efd90efd1d41be88fd8869228b9f43b4119e713540a140ec6c7e4b74e +size 765546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 395ee8efe5..d43ce2b940 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c97dead7b64d08db9177ff81e0380fda05694b3da8a468e329b8aec8bf3ea1ea -size 709117 +oid sha256:116e6978f3de08062169f46c7b3bfc1732aee711bf649ffd3442702582ac9074 +size 657696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0739733ef3..48e3513d6b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bff113cb72ef24db6474f4dea44131529557885d9439ad7068e4de7a2279d985 -size 785295 +oid sha256:21cd5355d64765e34c173c49bdb8426a1cda03b5cb735fb498b7c511274d9d1f +size 724204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ecf4678b91..1f7409fe2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:503fa4334b3ae1490c58a18e93e2b4809784335b380cf662d3f7ab58ee308b61 -size 667331 +oid sha256:de8b1aacdbcd1b1ff989359a42a2482617ff2b68aaeeddfbedb7808468e67c8e +size 620052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7edc7cf94a..505be646a6 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ed637c0e85f884d145e96e7ac872626fc4a6c249e09c5fc2c997bfbb917d45e -size 759237 +oid sha256:a0f8e8ec134f367ccfd3727c5e953b615bd6c746c1a9fe27107f9518cd21348d +size 710824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index adff264b36..c83840b9e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a1959e9106311b3ab11bea50b6f6e2b8310c635c9b2ef3ab53ff1a6f9f482e -size 737819 +oid sha256:19e1e39d212dedd9d59380ca5c0ae9afc9e98a7004629b34691589eab153f56b +size 695130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ab35eff934..e1143a0b01 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f677c90b26bbb46e10ea2e9f4ee39760a54304b0fee30bdbd14e13ad9e62ec81 -size 719355 +oid sha256:10ca7015dd140921c2f77be0a680222e78e42c9779efd570fedfc6fd1ca237a6 +size 684066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a6010ed6c1..34040f9676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25563ba040768721162d6ace9c21cb699bdd9dab779f74e7fc19aa41d85e41d2 -size 638193 +oid sha256:88778d17ef33f045eefc11d2c6c1d4b664212afd6e88a7e90ee9f9f0681d89eb +size 600831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ecc175869d..debfac15a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae6f6b68ed78f99d8bad1651f3021163254bfe0174dcb5377acef564b161f4d5 -size 749567 +oid sha256:4f2c4cbe2ba1829bba5c6d25ab1c39915ec22202fc97f0070984bb221ca9e6e9 +size 703670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 51950457b8..ba2923a02d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c39b7eb1ebcf984efe301107333c226873b74061a3ae70d9394baf5c4119c01 -size 728151 +oid sha256:b136e97f82940df9fc3c8eecf3019bf48405d622381d8f08ebd815cf55ab45a9 +size 687976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cb7f3449ae..4bc5a08abb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3955fca73dea44268499ad7589040f11b332fec72f99f3d0f4e9219b8596df8f -size 709637 +oid sha256:0127f7d8cb565485e1310d69f5ebb2fffbc94ce07ec2979268430b82e349bf76 +size 677702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 675fcfb376..cd37c9e4e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4c2a3bd24ce5f1997daec93b35f9154c2f4d475dbf630305245fe4c2e06c460 -size 628525 +oid sha256:476734a0dafdb2ee2cf547642ffb099d1996a3fe183552cef3b490f5c4424e0e +size 593677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 682dcb5748..b5b6d3f52b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a118e30aa87c197db86525c9eb4bcaf3a1906cea5fe21467ae245fbb48999462 -size 656471 +oid 
sha256:5492eec529f5e5cf15957cd5099cc256289944e4d69b94738f291d50aa1a2dae +size 618862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 70ecbe8c28..832ae9bcde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f68af87ca700f3e5a7894a473f70225d41c3857e3459d4d148ac5f80d421fac2 -size 629631 +oid sha256:be276683af123599209631afdffcc126536d2432708775230719cb1fd1d12ec7 +size 593947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aa9ed4b9ea..9b970bb3ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e796886034fde935c88827d2b784e0b107494913b2b7ad8820ca82f2d0234c0 -size 650199 +oid sha256:28a2d374818fe9343d1c4ab2671e8c52c17e5201b5ae0d64a776daecdcfd2b11 +size 610519 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3cda604a9..d82ea05a17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b3a01bfcc1453a16ff6b2bbb55d5be943b5c17e0fa83e27a952f060e6d37af7 -size 622571 +oid sha256:40d7f576f79fee92aee035c81fc9e843ad4c414f2bdc71eb311df14494507387 +size 588415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 59d7f1e7f1..848b1af356 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:244c751edc557acbef19d8b47a73fcda63566d13d780ce913c267f4ab36f6fdd -size 644167 +oid sha256:966e87ea38acea992ad167db1524e6042bcf75e8c52b955b68394bb55711300f +size 614797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index faea4182a2..12dac1ca18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:538ec56df72f0a4611b6255d7f75505f78a7b47f3bbe808e70370a082588b955 -size 563300 +oid sha256:a8014d67b0a55de075f3652e39af9ce96b9c3701f24d1bf2376eeb6b2f153b0a +size 532105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7fbbebc9a2..e1ac08bdd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b2328b6f451caf2389511f79c890a3e85ad81cc5b42f37fbabb21240a9dfe6f -size 613824 +oid sha256:6b1801d4227e87e89cdce9c89a8884e781feba5cc155b6efcf16242cb23cdc22 +size 588155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 367203d708..4c735b6199 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2be6ef74d92c5456294de200733fba9b7aaa7841206a1b01d9c25a59afe69e8 -size 534932 +oid sha256:c78330ab8f49d1f5e3089f8d1ceede60496b37b1ccae591d0560c3ebfc9b613b +size 505563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bf0c9026d..0b07c1387c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4ca59b71c3b1c8c8a866d71f19161d2202c08e1036254b9cd87d4e38addbbeb -size 646801 +oid sha256:e1fbcf1c2d8d214053460f067f8db79f2be441f50b6c99751d99cd41f41095c1 +size 611709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 658ed022d7..a2178dd271 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46303ead18ab87f0d69e0e4699c517310821e2011b6296b054f2b70db67086a3 -size 619963 +oid sha256:00b8a48faf948e4120e747168ec1245c69b4c0459b120a07ba35eb59484d9a47 +size 586793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index effc242153..9890ebc50d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db97ba4574e3c976ba45792f6a46e614307bfea1e2f8f8cb762d963c03181fad -size 640531 +oid sha256:3e092bb493822f9bc3125679c63a400291ce199780ede4b496fa26f8e1c49920 +size 603365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 97b2e80daf..5fc2f83334 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
@@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1532a2496fafcd32dd0def2b1e32f1c289243a914ed178aac49dec5ee7b6c59 -size 612900 +oid sha256:97e258d749255cc4cc7e04afc15ee699c7b1f3b1cfa64a2aaf0167fd45abab35 +size 581261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fac9d30b37..76ee6f1baa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bde9a697bb26c15a2a61d93ab7584a8e3de1766b1bebcf87b55af780a45ce243 -size 634349 +oid sha256:20516d74e9c32d2e0af97fffd1f408ffdf8f99aaa4fc0d8e554d437ef8058b3f +size 607643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fa96ad0e4a..5278cc0ec6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bf3ada07f4d358f999d8d460d0946672b21ed2991c7e761c70af2db8a6078c3 -size 554420 +oid sha256:c7e90f3404cc4d5b18cdd1cab88d7330e7f82fb308e8675f767eefced93045e1 
+size 524163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9afffca895..cc537cb741 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17581dd3cb0d12457d8fe00a25c982ace24bc8e8f8c6bae5595c0e267462f4af -size 604006 +oid sha256:86cff1955623351d45d822375391ff2b5ee813ab9008e74452405460b7f9f923 +size 581099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 111de689a5..c713a6b748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:265f04caec0038780ec17c9a71dd2e9d5610ef3c21f89e9b400b5a53045ae206 -size 525214 +oid sha256:e8b975ec291b2e14ae47e86dc156a468b139c97ecac579c6d28897a587812560 +size 498409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7b55ff4e4b..a094b6554a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:729f3194e5239fefe0517c66136b36bd8bce708ed8748788dc22942daad835f9 -size 674067 +oid sha256:5be57d0f319c18637896402fea1cbe06f3af4535cebaaad1d343835d66e91ced +size 640258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aa53bf7af6..857104cf77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cc04c31979bd134f829b5d4ea423cab4f9301f052fbf42220cccae6386036b8 -size 646439 +oid sha256:f7ec5278a26855f47d6ed9000dadd7b42a020aab46342bce30297a8ad2379b54 +size 601577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2ce3f084f2..c25efdecde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7145f97cb46574525b045df6911c15f86c395ca8f8492d184f655ade87285f6c -size 667797 +oid sha256:110e9505c1e561003a089ada841201bdbf9d81d448116f3b7d7617361508ed96 +size 630386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b1c021138f..40da70c08b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f38f18e1f27965cc254d52a51e757dcb75fd11e2a35457820d424bbdb04cc1d4 -size 640167 +oid sha256:89b44ab41f2852cedef3bfb8cfd0839711a3601ddc5ad442ccf3c46163f4ce0f +size 606603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c8ffa79641..7d5ea83ac4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea31dff1cfeb039def001c9c950c86a44060fb52ae61967ecaa8f564e134be2a -size 682483 +oid sha256:70d2e4b89b49f2ffc0a50a404bb21042b1d9f4dc624d38b59a45a9d2aab76a95 +size 640436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3dd6b3cf03..4e314d8edc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96d5421ab75b29b92c053e816c1bc92ff1fd467df13ef9bf1de606d7373f5a11 -size 574434 +oid sha256:ac28b886437596e419c09bcfd937b8ae6c74fc11226f5e8abf3811a650511970 +size 544423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index febcb262b7..6222332ecd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a8e456f17dd518ecacce321a37b8b89ddf8565cf44386b7fea43b6f7ba4ed61 -size 646173 +oid sha256:52aaf1685602f511e4be5bcb07bc04ef9886ab1246167a6e2a451b07971c1f33 +size 610833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4aab4cea06..bfbd04a97a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f90515bb0c5053fe9ee5b949b5abc7b43e7ce5fd9ea940418ec32b15c361ff4 -size 544586 +oid sha256:5a8229b1112f05ddc0acdfce5a9ffcffdb50de04af484e14d53e453494beac25 +size 517239 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 71c9549ef1..e4db6c361b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2a7dfd335f227e77a2808733727a7e82a90993999cce2e06fa85b8b98e6356b -size 665187 +oid sha256:f71a32d4b9dae5af0d45318230ae292e1a5232b8fdb62b0468f113294aed482a +size 633104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c508d65a57..47ad36751a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec0fe782608cde534ae3ee1995df0bd1bb2f8ef17fba8706c4677b7c1ab57aa5 -size 636769 +oid sha256:194470d6b91f1e30e8fc7d58a7dd0d3a9e32a9ed75e3091da53d53c671da0132 +size 595213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3c463497c2..7cc382b931 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c811ed2f296e2a91c310ad4b87a5f459e0d70e214485aff0242cc7f0900fc901 -size 658127 +oid sha256:8d256ac3d0ad1e6301cf891c33dee1f2a5864e28b1a8059ac8fe60c256490db7 +size 623232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7d73fdb932..4b7321497e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fee703a36eeed6011deab3d0ddb02779fdfcd5ec64bed8e6eba61c87fd9d7e33 -size 630499 +oid sha256:ea74f590178ded491df75c985b582ac3381f065bff69d28136031446d98d5c00 +size 599451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c908cb299f..080d372381 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bed3a0666ba0d7cf83ba5235c90fa042533d1356ed9e820a0f78d5eff095228c -size 672765 +oid sha256:7192741be93cc766bce1a164efd5dd702a59997d7da35dfa9b1b7fca78d81d35 +size 633282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index adc70ecf3a..e31eebd96b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bf8a99e3b1a8887324fef3e6b34ff2d510c5ee2aaf6309331b7bf05e5a89f87 -size 564766 +oid sha256:d9289968ead99a56ca10ee84dcf642967a17d9952d651e9750cb7a84b59e5d1b +size 537271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fc5641167f..e5510a8389 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73993a5553a4f9a5a470193558d5e5e074eaaabc7291478c3192d9690ae8af31 -size 636453 +oid sha256:a89e556fba89df25b9c305f20e5c1aee2729e80213849110c331a3bf467c3886 +size 603679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d6f143314a..756210d914 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc83f525744be8d93d10c2c24225c974700d7ad56d0a3a5b9eb7f5550f56b3a8 -size 534916 +oid sha256:f8c01d1e2c7e953b890cc3cff470a317369215719d2dce6f1bc362e351984d9b +size 510085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8e8e7e9e4e..ecfa149b2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56310c16f4b141cb70c5e89b99b0ff9550fa241fb0c359f786dd2506e081b3fa -size 651541 +oid sha256:115aa2c495e5143612c2222fb61f10c78bc6f690c227866b51f6af8f86a6fb49 +size 616029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 56648bd2f1..8ddc7f950e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:555260e94dae1ae5de5849ec11b626a0f53fa5113f007faf29e675ae98d7f2bc -size 591444 +oid sha256:e46a35a076b1d4585a6693a319fbf73f9cae2baab2366dc14175db1e56c21fae +size 547917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6ad281267f..917e975a74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22409120fa1b72bd34149273b6db169371720a8350c79deec4517de5ad6cd161 -size 632597 +oid sha256:c930faa24aabcb438fc56c3b898fa116325dc28773f3d5ebb3e5af215bbaa4f0 +size 600439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a315e88ba3..b50c7d7927 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09c3ed08721d365e2aaf18ba9de1513aa27201b59018e89d001117f32d2beab8 -size 574868 +oid sha256:d99cd3bc48688ecb83bcc5fab22daa010072b709437284d7ef46c68f5d568e87 +size 536865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5be6b6365c..8464e33189 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72e436b918aa447f1279f467ee0bedb2c06f3dcc66de9b221ca780de38e846b4 -size 498906 +oid sha256:5e3cf010c0939317b2975e0f434686f0cf7ed53d0f289443b8baf5646a64e86f +size 467613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b8a5eceb9..1e946cc082 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99be4aee59563d3e8bc35cd035bebb21e9256bceb5272ad0ba0b26b08dd622f4 -size 465260 +oid sha256:a92811d2c06bea90a7e928372f8ed6f6d836aae0b00d9ca4df5e9cbe93b8444f +size 437099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 33c7a1b667..e2f26d806a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b62a02c0b56385e74cfaecec06df81f4a9e3dcba334ceb12e070ed0e4bd63f26 -size 489750 +oid sha256:bd327b1f551549f5aba906a630b46bd6023130669c55841111e0f93437809dca +size 456087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b57f1a6607..1abdf15637 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4286291d808a6b3c86fc0cb06de5e7d191602e5acb34b4c3f75c032c39fd3aa -size 461628 +oid sha256:4a74e2acc8929a696260d3a15056752cad55fbf30cb67c88ee1005c53a0f4188 +size 432701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index afe1a127e8..a09dd08154 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:466831bfc7e43d44357e4c3e1900cc9ea72ce65b98639bace2f49a8cbddf94d1 -size 652329 +oid sha256:49f0e381d12b800c4f9c41e2ac8e8a34a6351433aa2ee3ce6209ba13199d8460 +size 613363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d9b109537c..1333cceda3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e87cd508761cbd1afae901782fe81a3d0a8f7b162e7eb9bcecf48876b0887137 -size 590678 +oid sha256:293a2ba5bcb30e78edb08d22ec23082ecbd144ee29249478c1345bcd7782baf6 +size 548777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c8e24fcf74..e931f70cc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eb0960dc4dc6e79e8dfd0e864e2b4ad4111ffbfe14fe64652d391d667e4a7b2 -size 452192 +oid sha256:786711dec5cf6187130584317148b89101d0f727619ab3835cf582880fb63d1e +size 434317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61352be059..84f255a223 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90718d3eddaef922baa20e2e7d166e8580c67b73de5291464319931875ef36f6 -size 400532 +oid sha256:38eedce2bdc8f5d2fd6d69fb294c775ea12ce30c19b1da8950ecc743ee5794c6 +size 375553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b92ebf36f0..53b4d1008a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6f6978b50ca3b77d6f3e08e27217af9b101db75b9b3bf1581bb8dfd9a94b9da -size 433542 +oid sha256:7f8ae2b4cc74adf8a441021d5dcd1edb6054709d54afd00592a153218856e22a +size 418035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 887cff0e1c..020a73cee2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:045d7db908196f995e62ab2ce884a43a1edeabecf30e73eb05102ca51b24f947 -size 383462 +oid sha256:6a03abe361fe27b3c6ff9e76540f1cdf9931603a1d3d595bfb5a6cc72c4c4467 +size 359271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f485525b7f..5ae55fd051 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cda93641d9f2f71cad388513557f67ffc38112284293e4cd93d31742e5722eb7 -size 482330 +oid sha256:229d6275addb9d3dd8585d88819bfb9cd84d3b5a2d4ec5598eea7b57ef876c9d +size 456563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e059dd89dc..10d6a976f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b000e0b4148d7a8f9d9a320d9faa97dbfd5bc130b88488d09f3fab37d06ccbb9 -size 448684 +oid sha256:8ecc4425857b1c31251100d5198728ffe47968b4d51be0f7455abf40ad853876 +size 426047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61adfe27d1..9a0475a36b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b8cb786f776a0cb39db0327007a5bf950a20e65596ab8553a06b29832ae166d -size 472384 +oid sha256:269ce4a55d363cc726b8dd1bb294e7c1db60532d8a63911ce9ccebbdf91b9636 +size 445037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 697c6d7eef..007917be45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0af184543b2fd60f0c3baa0b1049e00cb542a4a8d032fcee28ba870868c3354 -size 445052 +oid sha256:0e705518113a07ca848119c6bfda40b7a597b56b6cff377c6b3224227bbd4982 +size 421651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fea6fc2021..654b26010a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b3918b0e4ac7fe63abbbbaed775ac55c32046a839669728470b310e313a641e -size 629635 +oid sha256:163400ff6d56bf093bcb6a73c44e681ca7f3d7b74a1392897a7423b9f42391b0 +size 600437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index f1819dc367..60d58bbb8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22df2edbc5f7b07de33f10d04544305d0f4b6d6706bb21d355550f445154619a -size 575680 +oid sha256:60f4608a3b8d6635e7f6588b0eb9faffd5de878f47ede64b41cea8c2ef019955 +size 537727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1217a1b54a..daf11dc1ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:794a17a55e525edf715fb693e6ae60b2a84182cf4acd259cd4ed7568996782fb -size 436406 +oid sha256:b6fcf28df97676ac50bd61daaa8e6f0e20bb3a72f88e06ea64cd37d5feb80dbc +size 423267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6907308fb4..731b65c001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:338fa97ad32c4e54a980d9e3966cc5d3137128abb4e10fdda8d514cb2965dbdd -size 383956 +oid sha256:bd7f2e5def949d63d6ef633df87a3666c5ad906e8a525913b0375b768130f20d +size 364503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f35e937da7..1fc7029288 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27263657def38ea1fafb576dcba188e9e215d0130a6e7d1d2f8eb8ff53b666cd -size 417756 +oid sha256:b4e97affbbb283edbca3022979ee7ce067d95b46c23810f2c5e155cb0750d544 +size 406985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d0be2fcbda..8dae909498 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:908485c46bd286f97d35f9586a56705b58ae22ffc4ca3121e2e31ceaf45fbf41 -size 366886 +oid sha256:93741db19055546f371b01a876080dfa43fccdea2c5fd0bb9e28ed0baf5ea057 +size 348221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 80ac72f9f7..238d974310 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d469cca1b9394951a46c4ff8fc68fd49df02dbbc8eff71bc8108ec48009bb8e3 -size 512360 +oid sha256:6c34be61ecff0cf7339f04c7d31f5888e397aecbe136f24d7870280c21090247 +size 487381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6cf7b4ca87..660f61b2a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3af41c74e95d90a99cc60d8da85fb88e5353c7fce4e724a2b38b2d94786c74c -size 478712 +oid sha256:791ea86574e5e600775bf13c837fdfd3a02bba935d560e673e8a2c81dbb77645 +size 455287 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 60de7f0e79..e0de436ff1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaaa949e12d3b5d4a3e5096bd09e63de16743dff467d3c675a5bb18a390324bd -size 503992 +oid sha256:dc56571f263168e849174791bdadc3ddaf43d0d9c340b199ca62c3517f614042 +size 475855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2b590df08d..bea462b91c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9261efe8df2043f611269dab4fbc98d3be6041fe1ddecd8ea62322278df1f663 -size 
474290 +oid sha256:417a405f427ff066900c22fa9e24abb7d48e6b94d8f5df8b46140340e04f049e +size 450891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d65a6081d..b756157ae1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a0e4c606f6c455cc0990045963111c4ed43f0aac032c73c44d398935e2d5b34 -size 694297 +oid sha256:642b93c055d0d3feb102d824220ad3cbb0fda8224bcf7bb189833ac77845f289 +size 655628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9b776ef940..af911542af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:663df4717688c5228084d4799fd71b3d8d52deb7c4dbccf7062c222716ce0dbc -size 
630599 +oid sha256:046ca5200eb3ecd7b1071cd29fe1787ed090fcf0a622d840827cb93aadf46a30 +size 588155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2bc1311585..f7394a2e76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f99f43799d24e020f1b7f8130402bdc4eb23fae7ce9dbcdef3fd18e2f4c23a4 -size 486834 +oid sha256:85bbc87f1654b9f5510a7b1a54cabda14948a7d1b5caf87b528ce6f505327e6e +size 457933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 58cec72271..fab78ab01b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:6cb792a61669ad8359b2567a77b31e58a0a2df6577767e087ceb1f365b962829 -size 404390 +oid sha256:940d0fa2f9bd916db50593d2f168ec476760096f8716dc6c86bc80437b36bd8f +size 388093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dae50f74c9..ed3e3e396f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25be7a3c0b52b5df7804786e00510b71e3f0d4c334a9efb7130bfebabe457eed -size 463448 +oid sha256:95bde43870a1ac97c88ed1ae5fd14e352f702b79cff4ae68ee4f51ce3521de7b +size 436915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b883c9a3dd..0b6a5d0eea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:33fb54c327d6d9d341e854c5d5ba5ef75116a3b8ff3ad973390150f6baabcfd6 -size 385740 +oid sha256:35dba62dbbce48d177416f199e0cc559afa57e9f191f823a6afadf140d9f44c3 +size 370233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b4c0e656b..51bf4b23ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c755e4e683833e7669a89d25f6611c0dfbb31f09f7b6fdd42b54b25b4f2c1acb -size 495784 +oid sha256:32366e35fed185862127fbba7771a62fe413aa325a44564b8e1b078d786f6695 +size 476329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 14583b2e37..5759b79a7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3190e8032a26eea81515de2cfc5f8365597a39ec3f2b05beb84b4a05851a3cc -size 462136 +oid sha256:d6a6992b9dfd29f1aa513f4c628e2fb037c95893fb1951da45097500f0553300 +size 445025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ada7d5d8f8..a19b4aa744 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1e335ee99423fac36b7a93bcaac0988cd3b2dfc3cc1d237b77c205a3a042dec -size 487416 +oid sha256:c27783c9d96be08e525d1e51df39d968f886f057e51c1ab887370ab46dcdb53f +size 464805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e22fc828ff..65b42fb1fa 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43fe0f4d770d725c896de2b46e0676f3e1ffee35c0aa9e07d9ad6e4838e064df -size 457714 +oid sha256:8e6618da35dd05fb0d2a3d0760f0e9f68543ca54e59a995fb393813a282296de +size 440629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0f047d6ba7..6ef31ee345 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:339f2af42ca066009e88942ba142649fcfc2fa2424c378c4253180889363905e -size 675353 +oid sha256:1c97af5d8f4d4deb5ab8e6d8601383d5fdae4c2c6e369d25ad42a3a0482cc9e5 +size 640038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
aacad69166..02aae95636 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85c19183dad3beef33fe21fb76da3f72890f3a0734c93311f51cf8e3c7c802de -size 614022 +oid sha256:648ebb4a2b379dc8db190b47125899ddf7597d0375d34f701c80e83602890f25 +size 577105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6816be00c2..c8ef179d3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2744a18159d36ccae0e4977865dc53d39c84db776d80b28db2de0d84b0b6f4a4 -size 465522 +oid sha256:fa8031b0b358f98f248c7f2bc08a1d305986be9505f998a7020ef588868d3b4e +size 444515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a2d5c9ec44..faf21bb288 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b92a24c39e7fec7cf44bae64fdcd75a78113dd0e5a0e64632aa2e60f2de18b8 -size 389392 +oid sha256:1ac4953e05b24ff4f5a60dd715180f76e5ac1c316afcd980c2ace19000eae6cd +size 377043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4022f4708b..0284523def 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b8cbffce676d9906b434cce39aa12d2e2a63cc54bec5810aad3784cd5f29ddf -size 442136 +oid sha256:2453a6e72a1c8ac893bfcb31108704378413756baf6b13476896609f2e0b0314 +size 424285 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f89b72bca..b234d6e7a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:408a233dec6a857f9334ac1e26308d79c6e1fe2fde6ea68bb331a2316826d592 -size 370742 +oid sha256:1cf35ee7117b62c4822c684e09b1daa4e0ef9b4bb1126d7ef71fa95a9dbf3cd5 +size 359183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 753efe4804..92dec37f88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1484bcd81645a968d2eb9ec0ffe03d3454f0543b7478637b6597b73840b28c81 -size 695399 +oid sha256:99828b284ef84b985b6b70581c88e5ad4432c6ddfbae5cfa55c1ab82986bb4de +size 646764 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a902017051..08e0d03928 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf060f84ca6ce357bf393524f6bd53448ea651cf6120792373db2e80bebb5006 -size 601312 +oid sha256:2e25df5565dce3ec5ac75f240d19eba38e6237a3e1ef85589c8e6e8331cf1ffa +size 556599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 81cbe17f4d..12e5c1cc90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f15e0604469712d15a805102c5d3cd380ec1edf8b013cdc7334a27638cf1ceb6 -size 656721 +oid sha256:d2b9a90cbe87471debff459204cc4cacffeb13e0ed4c27db577e30750c9729a4 +size 623084 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 89e10fe878..9fd0ae1179 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc1dabe4e78f174cba5075865052c54b6e99fc24f5197cb56dff998e30c07881 -size 563424 +oid sha256:323362c7ae8431061d4c3ea1ce95efaf5a142ad078e3e979f9fff31754782e54 +size 535287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3e31d485e5..1fa66f283f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b528e8daa17f478743ec8224a74d0ff90ca6d85c4958049bf63e7a974fe43426 -size 635463 +oid sha256:1a9a0308c81316df5473fc5e6caa3715a1742a017814a761986829214533cec9 +size 580487 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5be670af29..122a74d589 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b10a832e37754503290c6bb69ac54995f99ee732fc863134666b685e0a5fb33 -size 607340 +oid sha256:77ec974c852a9bb87d1c39faaf7d4dd8e1e571ad16d7716d43dde994a9f8ff05 +size 555499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f7ad181c4c..06053f06db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee9c6a033b5f81247ca7024d0204e4a71cbbbab1540c8985f3a949db4325c705 -size 623937 +oid sha256:fdd787cfe1dd6319147acc60b71b4b6a415faf692c262e5fcb3ddaa6e6a747e1 +size 566595 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 96294cde98..fbfd2c9104 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:806338e3051aa054548cb130e9e289e722a2f2aab2792da77ae57d97cb42da50 -size 601340 +oid sha256:0eb597b2747363bc832f508c80fa867f5b0d6363d6a28606a3694cf5e0f3da4d +size 546365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 25b90b1459..1595a70c5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9889b4ec75a58c267dd0bc3e8722a040af69ac8e8a4c617fc21e5fcfc07a216f -size 700503 +oid sha256:153dfb1d196c64a2ad3d77410e4d14d05425296b64fc3eb006c47e8b5f3f7bee +size 649920 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8517d79274..d7ca5ba09b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe755063049f5b6a5f6e030c4fe52cf4eeaf70081106d6bc75eddd830de8338b -size 651581 +oid sha256:be2a4dad87fa1c3fecac187e3db4d8d43fbfa4f9f4b40d2494ebaad6233db43d +size 593055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e62ad272ae..4555d0dad9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6021d347bba365c352afeecf1da46724569a3f95c1189336bef9bb96e441624a -size 580064 +oid sha256:3434b11403fce6aa5321a436ce0400aece607d077d0239945edad4cbc8d70c7f +size 542457 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 39300485df..134a64f287 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f75ab84ab35d03d94dcfa7fafbc737eb0f8087f4d0d0f28fff4f01651c06304 -size 530772 +oid sha256:3051a991aac346f237aa4191405be4d4cfecac5156e61a642439b9c557c8abdb +size 484481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ef254471e4..b2f40caf43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1fb1aee4bd4db59f2a84084e24ce80efa001f4485ab60283add863082fe38d5 -size 559836 +oid sha256:c795203c3675d58c5c0c10c9b81c41b8067d0a63d180637bbcbc809ec2290006 +size 523807 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40896153f7..c521c0c8be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0289199a820ec111ebc5e4e35c4b0e25c4daea9ba31648ada302e2de4ba5f2f2 -size 510544 +oid sha256:548611454abf0be0ecde0fd2981038f83b6c6d709ed62f4359b54acf4599ab77 +size 467411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cc790145f8..718156a983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8cef8397ba141f01c9d0b6c12903e5fa5f6f694082c23a40de708935450d7d6 -size 597574 +oid sha256:eec5195240e85b351a9382de174cbff61fe91ea62d459d805319774ac06058d8 +size 559175 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6839e4b7a5..b927a2cda7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d3d15637125c36b532824aee8a67c97772d2f58ac7035f38f7448070c37ebc7 -size 569452 +oid sha256:66602b296d3316cfbdc93f72cfb5ba68b1acb70e6193e6338dacf5a0171b7b94 +size 534187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5fb4f434c8..df49671e48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a7c695377888b5d5ec9c8ef6bfdf5e2cc4f831cf5c894b14978f2d722ef6ef8 -size 586048 +oid sha256:235bc5c815430e7f96a9352570bd336f560176e2d4c61c1f9dc3ed873476ac6f +size 546071 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eace0c3c05..71490910be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d7b523a898939eecc138fcdc90335de1068da076c0470f67bbb05d8f7d40213 -size 563452 +oid sha256:c3feee329ef7f5fd3bc6b77953f6033886ba8ff5e0fe1385241925fefabbb793 +size 525843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index da7877f89b..d4a1d91104 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1da58a80a0896b2ba6f25cb7d681c70413f9875e6b16fa553079d60230c0fa9a -size 659877 +oid sha256:c73f64f5ab438bc388c7c748d9a19e4268c4f3020990611f36692e148fe5bc2b +size 626240 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8adcb137ce..bb84e278f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56fd9f759644e959638ff9ed2dc3172415eb313b044554ad13ccc48ab051e157 -size 610904 +oid sha256:441a89b8d281d00cfb85f196e45d809d9c07ff21001f9c022fc4d2b8d8e8e046 +size 571743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0b6c6be405..76ee8e529f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d324c7d8761f44d0e6fc08aa59b85154577d656aa86fb7426b77ce922b5d43ee -size 544544 +oid sha256:e90b517952437c9d424ab1623f2e0d61929e373fb4b4dc005b7df67fba861629 +size 521145 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1ec8c8cdb2..a9aae2c1c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34fcd7eb396367e535030c3874d40567045ce00c1cd3ad2259de05ec3e702ed5 -size 493674 +oid sha256:198536b482b719fab58bc18cc3ce1731eabc384e35e281f5bf7eae84cd3474db +size 463169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 79448a1c80..6515201212 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d23a9f0150b56e7e3f74826001a3cb933b4f7067f85e2d225889cf31c45bcf3e -size 523526 +oid sha256:06520a8c5d7f2e7aad060c64c20888e3e4eb714626bad0164005de9aa2fad0c7 +size 501705 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b40fc601b..df007a9eb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8c446df9ef7121a6f439f49249c043165a5be2afc9ece3ad8377bf0071c34b9 -size 473446 +oid sha256:b71d92b0dfe742305fc8afb06a49a3969a8f3c83b7693772211210fdf60ca784 +size 446099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 200f624a9c..1ac282896c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a77a40c1ee50875352ef4d8d4c1eb4321ebaf3dd3d074d45b29b9a8212c94d2 -size 650493 +oid sha256:b07e8760c8918262a99b3af10a296ff5b188d68998c83b18d8c2c1f2831826b6 +size 600255 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index db33e78b2e..91a3ad5e04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fdf6f3ccf5ae1fceb5f19382c370e842f9a887536f5eafc7354f221eda8efc3 -size 621583 +oid sha256:a5f207f973f0cb107e3b396f4405858267438ef9cd92aeed50c8d6acd2ca79e1 +size 574477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6849116a52..f44ede57c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:b6caf9d70f379144996dbda05936702e35b5df5ac234e75c8bc952a4322a1f7c -size 639757 +oid sha256:41a20bf8ee1c0ca28d24c96d15000c1331fd1899ebd77f13e16b0a0bb6108d14 +size 587151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2a04425140..0217145fa6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ec6196158cbe9d355a31eda573605221ca1232e3db939a67fbc65b9a1963387 -size 614792 +oid sha256:03e4109e60edc64198efbd3b116aae5befb61908822e9c7d2c09b122154be692 +size 566133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e2f5a5ea46..485731ed82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07f379da9f5a09951d23c6c1e745f862aa0e212ba8fa9c1cc9ed63c4ac1e7f8a -size 740891 +oid sha256:d0fdcefa684c310644dbe06f7b4824182a33f86abc3a6c9ba10b671cddc00831 +size 687152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 669fd223ff..6d1d01916b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9511da697fa0b082daa0558c80a08816cfb1acab4d797f8f49aae8d9600c2773 -size 642661 +oid sha256:ebcd2befcc678ebb32ea9616d8e5de32b9cedbbf8e2210a96e6dec3bb21f3abe +size 594617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9a39fc0bf3..aad3acc433 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70067364112aad98bfd3a1982b2d935164c866be6292c3111ea1c0e5c51dabdd -size 614706 +oid sha256:cadcd53d0fbfedd67ba88e056a7e2d34e9616cb456b549c1269e399ec099e936 +size 565283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5ef268104c..00cd29130c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80a389901d02daa8f839d9faacbdeb520efdbeff9d5272b6ddc2371a962aa5c3 -size 532262 +oid sha256:b3923380d971ddb04cd9de1f92ec3af0af2f352749f1a1db3ce2d57014eae219 +size 492285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 40b7bb4854..9cec013066 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5185aad9182b4e78642aaf115b0148e8be7307c0f85028eed0df2bb54659d48c -size 589742 +oid sha256:4e6c156961017b84d9fe29cadc4d74773b41cc6968c1bfed2b1a9cb5d3606b4d +size 543475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 845bcff5b0..0e0983a38e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:331d0e3462fb919c2fdab184e4f92271db54a2aa86b15b13c9095fd11999822f -size 512822 +oid sha256:4f5775bb3f0e2371eaf618fcf607fc0666a0fe2e49d736aa04333c75b37c8960 +size 473635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1e2a5f137c..3d482c8db7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53c82b399f9343e62cc1c42d93daf9787a5f1aabbfbe221bdb4d6b70ab6cc200 -size 611816 +oid sha256:c35546a25935449c0e719318d2ba786369ceeb33671390dc7468a8b4f8811798 +size 579733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a95578220b..731eb93aa6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82127c61f336a432d679b900bacc8e1a20e96d4cc907b8f309352289a638552e -size 583694 +oid sha256:73038764069e2d1797fc20274ff62030331f0c956140beaa8f2f3811fa0a947a +size 553165 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 12d4a4dc53..8d8d807bfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18ca94c5803151ee1e9058c0960f8a130832cdaf96ad11fe01bd86e588e8117b -size 601080 +oid sha256:22bdc3aeb9156f4cea64b6ae659d7cff7015096b5370eeb19adbe3132bcdfe2a +size 565839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d42c2c384e..6eff7a3590 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:903c1edf747b78d144b3520c584c20392cdd4d69f440a25f2892c3adf2491da1 -size 
576114 +oid sha256:0f4ff2ed83f4674badd4c232461f6718a0293e74582c6bc720d439579e82baa3 +size 544821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 80eb3fbd8d..4da2e705ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0ba9378abcde0210edb691f0713f7aca02f6a6e6e84a714a995458eccc211d8 -size 701055 +oid sha256:68f9e6b36234f9ff3f6ab7ae52d0b30d2ff81a728191543976c1c633b66f0cf6 +size 663472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4734ebd218..78fc8f406d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a67531fbdf2365171207afbcda002830dce647c86d745ccc140d22ea7e9f712 -size 
602232 +oid sha256:f0387d68ea06e6edc9eefda0838506641952363eb2de89473111cf1cec6070e1 +size 574095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0bf8f61f3f..a5c6345a57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d581760e9e0f32cf3e0b38251b9b85068f729e264caa5b4215a157ea97c21639 -size 574450 +oid sha256:5fb9d8442531ba3f68d297cd4a82fba1f1b56b617c72133948bde37b6accaa83 +size 541603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b1eebc533d..7d4c906bc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:39781347a283f5ae9478c88c32ba732fb98bf451bd25c1c77816f632b6097006 -size 495164 +oid sha256:5e6fb357c347bec49ed59d5f05577826834ab222d38bd7dd45a831bfa806c17f +size 471763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 69d61905e3..54bc7a1adb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f70d34b8e24a90ab1098931582ecd924fe64cb41232fe2b8c49ac050196ee80 -size 550274 +oid sha256:5da4888097f9bfc17e6214cbcff17bd4239001a11e09901fd40da6293d6fa4f0 +size 519795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1e461bcd9..5b579863ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:06893077258511c413e5ae6ab23bc17e866f2616bfd41fe93bb88d56e70ed8c8 -size 474934 +oid sha256:ffb3e4e75e909ea3fd6af71210182f2e3ed4ba821ac6ff0eb61a09907a4f9689 +size 452323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e493921071..7663a09d67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:199264a4b4010c50fe1244a546ca51f3f01b5d28baf1d6d16b8afa14a014cc10 -size 588588 +oid sha256:4a521f852cbf9f7fbbb8c0e059ac01791cfa86eb6e43922d8b6838d126b8b2d0 +size 545207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5a8a0a98f9..8dbfad1339 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5766ea6e51097ee3bd6bf86e618d04b7d5fabbdfb47416c90585c9fa1416b969 -size 525630 +oid 
sha256:8a28c1a2078f6b709c1774429ad94a581248e672eb1eb0ca98d44f069642573e +size 486467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c2d67e6bfc..0da5bc5117 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55416b1810ca19e4c29723c2757a52e5eb073b861a7d2932ab93d5715a8c664d -size 566954 +oid sha256:1e337a4385a8df0ae0308dcef6930bd5457bb5bff203b8ae0a458348df5d1e33 +size 531789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4817152257..1e6de730cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db869afc2251d39d75b99e3c9d48d1ef7e1e89904429c6e2c262bc2d8ea6f6df -size 508264 +oid sha256:ec673d0ff40ec600f7dcffc47f07a3937fe05673a9727c55d5bb7611e451e3b7 +size 476207 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 810cb5b4e1..e6473e58d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597cb28bbbb959384f10fb4628bcc295ab4dec7de8f52f2dd0f0d0650fbf52ba -size 472262 +oid sha256:508c33083ee4a07d9cd19defe49fcfc5dfc011b5d7bf96472a2399b88d5c6e3a +size 440205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a2d33b7653..616e5d6874 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:926b7adf986e9f3d0e44b601853c41aeb4369730ed257abfc2ca618b4c02d420 -size 456572 +oid sha256:08adf1c0490ee721c971dacd682ce9e94afef7858093c04450cdaecaaa45875c +size 426043 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e660b6766..6a305e5e77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc131f4c0f6095767d3d129ed625324e01e9412cc51eb143b7f74ec8bfbdc059 -size 467052 +oid sha256:d452778cacd53530e281326efa95674c4ff3ddf47f7fde850afd1daaa4f58726 +size 434969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4bc97e5e49..9ae269475b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a60590ee5b89bbba675b17c8e345eb0b82b63f7b86e88e599a9f1f67c5baf2bc -size 451362 +oid sha256:f0c39890b2b925e8bac0a0fa53303048ab0cab72ade9255290141208278e3347 +size 421647 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 13cffeae23..8a6ad1a3af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a353737ea049d67b22a872c9e328dc6dbb3715cda13e9111ce0fdb6e82046826 -size 588586 +oid sha256:813040a0de8d1296ce06f4fab65fe22782cedf7f473b20f44021bc25fdb22eff +size 542837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d27a909f09..61fb304700 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:930236bfb8c421d3c5445c8cda93ec10ca0d7d8a88bba7db058bc1cc43b474a0 -size 526492 +oid sha256:da0ab942a369bd2a927301b2c3a63f605af56d34cc14501d0bf3290362756ecf +size 488119 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8ea898a82d..e40fbef289 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28f4a6c2d7139486a85f75a57596e2820c27256f8cfc2e0e1805968ee0ad1d59 -size 439756 +oid sha256:bb38fd8006ddd5205e0ffa6495fdb736d93ba68e397d31c2c8cacbcf428d2ead +size 421881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0dc441b8d0..bd348a7897 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d70b54731c845de6cb29bf44bf72f4779ce0e02898ae6f26ac977122e579de7c -size 391254 +oid sha256:eebd5a8c5f10ffa405f1a09e99a3a2283e1714c900dc9cf19de25a808aa26a7d +size 364697 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a3552417a7..8c6519e2a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f86a2ec5f715c9acc5fd4cfdb918d856945e0156d4143e37be60c196aca00cbc -size 421698 +oid sha256:b711a24e7067bf7d63715452dfa0f611e5f1a8f654422b251ac60540b3351934 +size 405403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b40f051eeb..a8529957ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56f0d93bcf22f3aa1d32386ce22217f5867c03f4162b58cf10aa86399df52533 -size 373986 +oid sha256:5fa12de180781afeaa18bc2b5dd291f1e7d1f23aedaa315af32916908d894276 +size 348217 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 59c3ff1076..a940cc222d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba753835286e600ebeb9e1b19928c5ae6be7af5affd823a58b66d06a140ff296 -size 455686 +oid sha256:d92c7eb540ec6386a7f728d02d3e49ce99b69ed381e9a5a9c06b99c55765b3b6 +size 429153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc57d2c324..3f475efe02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e420b50dfbd8f4385b25c980d421971d755ee21032c748ca434bb69f81734b5e -size 439996 +oid sha256:47aa7cf998bb93be07f75fa7a9ef698e33490ceff02657c4b12c41aff538b32d +size 414993 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 96ee4e5aa4..85374ece73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8fa1214b7cadb2b4146274d9959e433c8c07122717550665d49315222caafbc -size 450476 +oid sha256:ef48fd69e0a48be8f0eb42c93c9affb16662499c33c1e8de8148355527aee044 +size 423919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5db8ae5fa2..0e85733432 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7fe3cf10d04c62b6eae442917cb4dd31f7f9901c0d3171799a297279f0f474f -size 434786 +oid sha256:6827e2cf55c82ae8d5fcac487494a64d489479e36bda4de481f00e07d02b7c13 +size 410597 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0c5cf9aea9..8db184e5b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36aaa0c2fd57b3514de2a7eef4fc9d530bd69058c2c70418ad7e0c3d80184daf -size 566164 +oid sha256:b797e18608601b896ab679fd956919f7ef50bde095ea4e274f180a9d1b2e5786 +size 530997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 919e2f6220..6d2c475dfd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb4756fc59ffe3990cb0af4218199a24180f94310784153d19e15ecb098b4c99 -size 509126 +oid sha256:e6497bf8a592d87a04b50427fc3e2fbf0423efb15176f69c9e696269492f4019 +size 477067 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2e1493590c..7e59bdb052 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e8d925c5ff36ac58ddb21676c15ab87bfa5622cce8b1dff10dfded49c56bce5 -size 423180 +oid sha256:4d25191b51fe4e88761b0c56f1e04a81e00a8677777aaf0906c7476de6b00157 +size 410831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0bd0742a31..2279c97155 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7864cf033ae6645fe7dc20ac44436da0c7f599b949b8096b8592b60404f5f2ff -size 374678 +oid sha256:6a31bf74ac2d2f70d94a3514bb13432f4cc64a0cabc45602e047021778d61133 +size 353645 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a609073343..613a2983a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:523faae937aab65019683c7d621f25c015c871cc5a2fac7083eeb0befa6e2757 -size 405122 +oid sha256:b1f709dca74881684a4da6e3ce50e04dc3f1c8a5c3fab458203330c5ad57ec14 +size 393563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d0a358ed1f..e268b8cdc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2efe73d103bd33800fae06c81435eafd0a32a9b5dcbdda22bdc4eab2b81277b4 -size 357410 +oid sha256:91968f7e1860bf62af5d662d21919b091241f974fcef64e4c20af975d74c6319 +size 337167 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d737ed5084..6c8475e4db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e491c77b84ea43da36dbf907af3b182eb9cfe099bad55c30e13983c7e661ad97 -size 486504 +oid sha256:dbac2b46bd4d10cbc7589ad20de63a8cb551db6b547bb374a1a6348ef2fe8af4 +size 460761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 01c21d8500..f08baaf259 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50a6d08494040cc80bcedc1e81d9062fc9fa086c4400329195ad3d4066042cba 
-size 469236 +oid sha256:6fe7c21130b8811005d56b484b1bcd06e363f7c8b7af1f6b6591d5470701de50 +size 445811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1ac222301f..74286434bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:695859685e0e26ad0de9d2b29a74a974345477909e259e41736ede5401f10be3 -size 480504 +oid sha256:44ef205aed29d32b4992bbd09764f5004918610e13a8a8e81e3b2857b7f947ca +size 454737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ec4c2a4895..74ec4c47ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:2fed9b0467f2744b1c2a8f4b70f51eecbf70d90c820cb8bddbb1c69901e91bbf -size 464026 +oid sha256:41a967b547c397dd5b8644da007bd74be5a687fad2d1596ec4bf79a368c85414 +size 439835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b3bc6c9c5..00122e3d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44e738d4e0635db87478b422bc7070ea51ee555b63194082ab6119ed28b42693 -size 631343 +oid sha256:28ac26e9fc5747a8934a36433b54631626ade54114ad706b24460467854f5204 +size 584805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2a4a7eadeb..fef7f90f85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:d47359ac8a47629c5b54931289991aedcde61a31c9fd802e7e1c1fa1e66c094d -size 567596 +oid sha256:33a90a39eb27fbc55419d23293639632e8f861000b913ca6c4e6b9a5c16e8a75 +size 524487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4423894101..8cf801250d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:859bad070ee7a082874aa303ce6fb04e598a3a941160b576a3f23b067733cefc -size 474398 +oid sha256:01870550afb3f9e9df939e013abab22ac38ec63a714f020a9f2b273fa1a01ed6 +size 447075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 68376af445..50619a8867 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca3411311098ffeb5441fb72abe11802d428438b42709a3993705746d8ba7463 -size 391954 +oid sha256:f61fd8b4e5e686eba61480be7b411aa4b08f9c05d4d02948ee303388189a0083 +size 376447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 77226d419f..dc2eecca06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23c458118fb72ea47a08d23d5036d165871070d077702a68227e6f7a7246902e -size 450026 +oid sha256:0ff4ced91a50ed0fc367d1833e1fe6c76b2df044d4fc9acd61d17308c1200352 +size 425861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 611f1e91bf..890936f6d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3553b844b10a0fcf791cda873fe33c55c0aff9afbe815496e3f84c1cac8ebb6 -size 372318 +oid sha256:91e5eec196761e70d969d97377d04e3b6c08b86af7db474dc6d906d422630cfc +size 358389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b581f4bd3d..80315528db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d91aa9804510ecb4a4cbe1738dd74495dec81743274a68ef4b2820acf3b35fe -size 469928 +oid sha256:ac9adb5a439febf44250de85384b4f3236daa6870740ea709351600912297e0b +size 449711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 842ca712f8..f2595cbd92 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dff6f860f14ded4f82a9bff4bfa9cd4f74b03bc21eae0763c005c1f1a945020 -size 452660 +oid sha256:fdb2f94dd5699664c3b027ee5b382258b0b18a5e243881e288105039ad026181 +size 434761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 84093f4dc5..19bb679115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6984c035d21b487d4b5323b2ff0e736f20fabaf40d233db2ccdfcf239f36d29 -size 463928 +oid sha256:89ebbe4e749414c1eb3ad7eb98ad261adf139b8ad0e7e626a315f19f6e319bf9 +size 444475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5e71050b5d..1c66d6f4b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e236624de72b502f7c2c36f939b9808b87c11d90aff1b2eb2d1ca011c06900f7 -size 447450 +oid sha256:3d7dce09ad9b8f3e999a84f7ee590789237fba8e2598a66ec325cff57cc37637 +size 428785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 263f91b562..6994544284 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a641416f214a6084414c52a8d54349ea8e89447b78cbd5de634b476abca09dd3 -size 609710 +oid sha256:03509c1bf603d230eeb0ac4d6faedbd75147226d12d1553bae45ba99551aa76c +size 570597 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ca27a0adc2..55e0d50e86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5434eb30781bb0341d2ea3cb772bab4400cf480c747efff739c32a89fa86542 -size 550230 +oid sha256:1ac73f5a78f6c9487ce79b2eefed12e2d33a8ccaa85136e57ae49d75021dc06f +size 514225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8a5d8d9c34..c51a0767d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61276221e4b377645cdf965bd5d78ef5814155f0ff00e46eec04c68b477130e2 -size 453086 +oid sha256:b89714e2bd2bd82d44d84646cd4f222878c4668c531fdcbad8ce86677547715d +size 433657 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b3ce648ee6..8a2c7fcb7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d987e4299b8e0db81ce9918257e12ec42686c465306bbbac5aa9cfb324d5c5c4 -size 376168 +oid sha256:be277a03b4b3d90d92bfeb5530b9b542ef9b3a9e69c4c62e8f4963da89ad050a +size 365397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ffd927c0aa..15ffbc4c1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45b005ca51f406462c4ddcf5638db48ca7b38dd7237d2f059dc007a4c1aa36e9 -size 428714 +oid sha256:d91b400eec43358ac6477faa9dbc952683ee8191bff84f5832798f01a4443f02 +size 
413231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 184b7c487d..e168741cba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee026bca6f994945ce4b9df95b8de728885913571ccb196889bcc6e0cdfa9c4e -size 357320 +oid sha256:eec7082de1a0539f2d512618745af39e195be3bea829036b386c92a13b7eddc3 +size 347339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dd013bbfe3..11e1b009f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8603f47404a939f550b0e5e89432fd3035cc07db9c2ebab7f736743a520edae -size 709267 +oid sha256:2d7f04519907690bbdd2d7358a8983ad82931e48d5f199257d457f83876912e0 +size 679356 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6529be94cc..7b3723b397 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5de627bd6b703c35b6e80816d0564a0973170a80ee8b3ac88cab57c597d80892 -size 624407 +oid sha256:f27fa43891d9e089bed8cfdbbb49ba4ce2ae8e442ba7bd7c90595cc21be61e2c +size 593211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 4990d27fa2..53ba591b1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aa95b30dd2a79f3e2b9a49eb357b01b99487cbbe90bebfd2fc536250b7d38e1 -size 712621 +oid sha256:9ab85ac30a87823a0fb2b0c03584de007a5c000b161bdedeb48e98d34d51da03 +size 683646 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6e51b06667..e0261a155c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbf4fd028462f382807c07ee960a3e9e18e6f725464ebae38f18d661c92bb79b -size 628597 +oid sha256:d3d6b05c9e68569690443a80db40420382e9175b0a9b6079d3c8a92c36437020 +size 598833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9e999a133d..d85c2bebed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b675597278450762d649e84426dac38e7920ac3429843e7c401ccf4da59e599 -size 775703 +oid sha256:5565327dc94b8377b6c4995a69f30422c0c76573034faba17ec59dbd83d00af0 +size 746382 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 10ede0eeb9..22efe8ba1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7da541750640cbe3d3c607c91913835498c4e3f020ad985fd32e84e81ae0587b -size 691827 +oid sha256:1822131d917a1dfe975b2caf97483a1c3337723bf68f575e7089971e6384b94f +size 661866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 58e395d822..09a7310b61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77de1791d3e32c45e70da9b0d28f52868685ad6868ec774d79d192e5519f2940 -size 819825 +oid sha256:5b49dff21ca73f24ae1038aa0b9e395dcce914daa19419dc541b4303bf560c03 +size 780392 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 517308d6a8..62d95e2058 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2765f977ee016b6946d52dd56667802e679a444d064a72702b006283206430d -size 728551 +oid sha256:40a4c931cfda4d329d6b15334a50b7524a715936753ff624692e882d14460951 +size 687194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8132329721..d50700f796 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20c8a484306345c22e8acab97e2b392debdb648d5caca4b9739b1af9386b9b04 -size 795109 +oid sha256:d31d88b27bdf41b19230e1a30d707911cc074231fa7a3f424e4fd559b260603b +size 764802 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 58e3bc9933..654b2e4a0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9cafdee1cee3462d9130320b9dd16a846c0b83128099d2b20031052515f9e21 -size 704921 +oid sha256:34ff454020a1487167726a903563ed8fab258b375887656c40e6bf3ec5aef79c +size 669730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8b3f36b922..11ea08db81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ed57a5cf4e3501a42a7bbcfb31abd8026c8247aea2c327658a011a215ad4fc3 -size 821995 +oid sha256:372ab8a1d7c928eee18c780b818061a1d6f67ec4971f519c7c23d5733348e150 +size 782610 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a407348f63..d2113f661b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fe2ac62aaa757598c41bd410f5af8fbdcabbde4d6b2884abfa949b11268cf39 -size 733631 +oid sha256:6c3e5089517ae8612445bdd450019419a4592a0833210bc0dc7220fe643224a6 +size 694048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a3b8e50294..4afe5287fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ead5385c89cea2e15e111658165accba6b9d1705d6cae132c4248e72f717776c -size 796587 +oid sha256:c6187fcde987eb4adb54984deea566bc58758febd8d0b29e4e8e63292629ecb6 +size 765048 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3f355a9c7a..89c96d4de8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bdb571547e4be060ee5fc885a95798a7e6a07b10aea51360f927bfcc9c8fc0b -size 708371 +oid sha256:b28921526f7fad99332e7c4986299cf59741d5bea0f7ce4bd0f4df4b8352b6b2 +size 676584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f8b41ca39a..74e2efd9c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b6c085df62ccacd971201ea99af5ee1d29626fc74a5543ba553e1a194001aa4 -size 889269 +oid sha256:77469688daa1eb82f6fdd603afbe5de370cdc310c74d4b6be929a546d57bca99 +size 851020 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index da7946b501..48e5632d16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46d12bed2bfa9aa8d0b73cf7a33c3f0b23f77a6c47521f5de0212dfbbb979f1c -size 798093 +oid sha256:94d4fb14c61457a2076ad78f2600bf30eb3088de27d0e5b1afd9ba87221c4f2b +size 760238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fb8c6ca3c4..ceff7584c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c36501fb0199824a0f5d40c07562dce121d94df685042190c704aae8ac69f529 -size 864603 +oid 
sha256:d229b0761eb0788f610aec0cdb57966cd71790706caf9186f95bd1e9f77857e2 +size 834000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b522571907..05e65f52f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ed889cef762b034ecc71d64832b0e92f146fdd63d27c691f9c0c37960c0348f -size 774513 +oid sha256:1dda24df45e19eb71dd045a09cd36292062bdf3a71e1bfc849a87323639bf367 +size 743614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ebfcfc306f..e543a94044 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fa1b43f0be13d6495ccc5ed5ff02d587655e2b1d13b620e18b6cf89d82ef870 -size 661563 +oid 
sha256:7b9f24c851a8410576a3f55890d95de8df87efa24b58509431ad8738760be115 +size 638162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e5aa2aa745..8f26700f1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73a3578d6ed48c92c0a27981ac2f7edc5686c7fff5387608b57fc01c14b65b01 -size 558150 +oid sha256:5f30d2210f7883079fb4423d31b5874514c30b00cdd462daa4905f905ad1f389 +size 541805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ca5aeffed9..6b9a716807 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1552da960f9766d2a74d63e8a88ccd47f1c5e3d943bdf29c47de70a4f8c014e7 -size 659685 +oid sha256:0e8087ac14c6d615b61ee0865f8ff412f22997bcd8cce1af34bdb669e2757dec +size 637420 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 8cb8d0f5bc..3efcfbc4cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2246097d92923c6a0127a22cd58cb3c764734451fe8829d8a201b8ab6d993b15 -size 577586 +oid sha256:cf9547d061ef6737596d748d5beb8378b39c32428142c47cf276171e18e2d32a +size 556603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 74db97d4f1..082e81cb9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:950e0b8e6cc16e89e7b62686daebb1dd6ed1efb8e4bd57ca5234202c6b88b828 -size 727257 +oid sha256:5f04ef437348dcf37cc3e3a2917503caf8fd6fe6d04e43346fab3fcbc8070fe2 +size 707260 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 11c1fd0a03..925439c2e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c132bf82b3ef7c3d6d824f6d6661197ce349a56de6f0a1e9ff2b658f7354340 -size 624833 +oid sha256:669660266ebce88f655ba6c8c861fed1631633ece834c1b44b7050d1a5b4aefe +size 610115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 70976a6475..e68427bfc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5030d38920c30ad48254d5081c8642d5a6da7461707f431d4099d1b6ab14cafe -size 805469 +oid sha256:f8f013c2db8b21c1a983247125c0dcee428448fbb9c612eceb2bbfce0d0eae7d +size 755330 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 041584ef1f..bcc77cc1e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91c069a14984b784d6a104e5ffadb9d29b6240c2760105674ca1d40b60c90f9f -size 683461 +oid sha256:640a5e1f1e65cd1ba8d2f5787a2bb9a20b96b5acafbeada72faf177edb0c5eee +size 650390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c14124f045..e203bab744 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:238a5520d8b9fdca7d42887fdba857f885f87b4317863ebb4b3c42a01c8cde93 -size 751153 +oid sha256:2728a39bd450dbbec6a99f5b549a53dc72884aac6adabd10be53a3efcc8bd7a2 +size 727506 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e405ec2b81..ca72a25946 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6bbe3af8e826ed340f4b78feb559725045a003f4593a3d1a03ad0ba14dbb958 -size 645917 +oid sha256:48af9407d039afad51385dcc05b92eee9e51322b337b3b6f62fae368470c1f6a +size 623356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9f914c5e7a..eeb5b3a570 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2abcce2065ec670253ad5a51d825813b469f97522c60e6159e9dfb2d510fde74 -size 807441 +oid sha256:78ad8743ec24a5ea7e18fd1b1a3782128230c30e50a28d159988d4caec8a5092 +size 749408 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7d2bfbcb53..1226f56d48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab2b262f98d466bd3f82c2ac219071000dcc6876a5a11a2b873b28ebd0216e76 -size 712959 +oid sha256:72bc894b28c07807169ed0914672d81fd2cca9cb4c18cc1931ec4fea4a2a32ae +size 667458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ebfa3e023..1fd04526d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:636a12051f3695cce2f2cb18aa2dc8207097cc32a155052d2f9fe580c7958263 -size 752335 +oid sha256:a6745786f5b53a9c492c810860181ca71757d42fd968927abd8bf043f973329c +size 721634 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 257c78e040..0d2c24f3c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:781484f353ad75a5104d423aa0972ed9b74c10e3e10f5d991710f75dc2835199 -size 668559 +oid sha256:d41e922e61405895b8c268c630c426963c75b565b8b95f867c5983f4f86ed609 +size 639684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 341f44e5b3..859f7dcdca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a23a685743a923d220cef430e98335e126ab841ab806fa318959c3bbcf4ba4bc -size 875999 +oid sha256:5ff15e7e2c8014ed0b5d17078de4c224f5c250206362f0ccc3b1990867f77548 +size 819742 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 639e3c2990..5e4877dbfa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba73a2740ad9dbe788b85c1943d5aeb599c73285da15831dc371bdfc3d23a736 -size 752213 +oid sha256:8007c941b227badcde3460f2387861338fe3ac5a10cb5e6fe46602acf7ab8d7f +size 717762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 46bd2de7f9..dddf159fa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d210431f5d88d1313dc781eeffa1846d66db199d04ee62557e55e915f3737e87 -size 822769 +oid 
sha256:b959b5ef2960bb03be5531396da8decf15e067f64031ec57f5e6b29fe98a3ef6 +size 791918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index da5816042b..7f698a61b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5de5cd4a09a5a99164400ae35649a9a7d6b7df0ba3dffc692f4f82d7bdd1c79 -size 714671 +oid sha256:164925c4aea4d837dce0e28a0ae5de968370d4791d572a7a2b5efb4c87a9d1a2 +size 690778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index db5602ecf3..5c113ea148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a5e57d8fe7f80bb6c9db65472236deaff846ef5f08a986c83753b99ae150a00 -size 667479 +oid 
sha256:fc154c781edf7cfb1961a976067bdc8be4042a077e057855371e31f41299485f +size 637812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b7ecfa709c..0e5707dc6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59f63141dccc57497ac283b62b4a2922c98ff311cab9bed5ae15cebb136a1862 -size 578422 +oid sha256:2357f5713a777291dc1fc0ad3279249925e43ff8f9e7cbd7ec9b5ef7c4bd1fdc +size 552605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ad94a9ac30..fbdc330890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c928a8852c63a407f50bed8d9fdc6ebc12a580dba1b3f112cfbdbf1779197ce -size 669401 +oid sha256:c675ec723303c50b461f3fd7cb896a2590378a4cbbb8a13307079775ddacddfd +size 637810 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 84c2380621..5fb6dee4b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff814f584fbfc972cf8a708a1842c8cb25392dd132ed2d4c45cc762b36882b8f -size 583452 +oid sha256:61c6ee1e3f89c8f3ca004d22c2137ce5cde3def15d56eace17a86e880db7463c +size 559213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3825f2263e..eb61d91af2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de3e0ca924a7d01063ad76319e1d11faa7ba9aa100527165ba3b224865e302ec -size 734653 +oid sha256:cc3fad10edc968dd8e2bbadc3c64d78bb8275426302926ef96a5b23052f48c32 +size 704840 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 97fb3c7edb..22e68c0f0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8724365020d48c6ac188f3ce94276fb68e3aa0b9037f35fac0cb056ab46ed16e -size 647177 +oid sha256:628abc0191b056adb755df58adb2ab40e89cca5e4e7369ded4038cd41fda603c +size 621556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bb0aaeffcc..f46bbb479c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70b25e26ca85831255528d55f9d384f7cf742775f9e2f6c8b4f69e91be54b578 -size 759093 +oid sha256:e376e0bcafd04286da735b01e313b9ccac7c6fe8835ef68962edc73442600c32 +size 725628 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c3f5928e2b..634a5daae1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06a06904361d79f007de91309ab274182a7fb84951f87398413519afeabd673b -size 667177 +oid sha256:41541cc2ce462ed8fc3dff2fb039bbe1fb9f6ccaa27d87a73926cc5ae0cb6beb +size 632578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2eaa120db5..0a535c5933 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:188badf0cf4d059da45f4e575853c924fd4a2aac01e623e05f10586fdf9fbf6f -size 749373 +oid sha256:f1b9dba526a1d955ded4e8e03d04fc5a0fb3275b682990c347bb2e55ae5b265f +size 718474 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9b1ca6b59a..01d0f4bfc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57fca7d14ae5fb3ffc4c167677be89b78355fe66d94d13b8fc231902b084d75c -size 658297 +oid sha256:3a7bdedf0e3cc3f0d8584ad0620746f0e28f8724593762cf353e989284f82b70 +size 625474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d2c7064e80..616aed9a1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:994e4c433e1c67ecb1e473faad1f160a496c2869064fa4ac4b5e40eeda0f2774 -size 759435 +oid sha256:c025a16ed4e7789cb40a1266ee285c81cf30eca4ef997bac38e805ed28c9f59d +size 722222 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 419aab345a..44e22063a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b48479b1e74c7feafaf4e26dcd3284aa1f9a996bb933f70f09124e8a1bc0cff -size 671367 +oid sha256:7dfbbc834d8566ae1338fdf45174ea893f966756b1dcf3476570c5225d9bbb17 +size 640320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0ad96cff74..13c809dbaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ff27eae6529a38c8b720c6af5fb0701c41e26f725017dce58a266e34d0c3311 -size 750359 +oid sha256:17efd6e35d3ffa092793480b7947b67012e1e2c018bf6f620c2d74aa7ebada56 +size 715168 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e3ba9509b2..ef75f4442c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dc1546918130f85f35b2cd80955f46e36c30b64281500ddb202cd7e89c08663 -size 661747 +oid sha256:055e64c4730aceb204b9ebf485c3518c5a6e9f34a45922b4e4a9f10add4b3556 +size 633168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ae6583df0a..9953caef56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98430db05d98a1aa52cda0fe80fc44ec44ed7be8425f1e3f62d39eee543efb03 -size 828141 +oid sha256:e808bbfe9a8a59833100d1a024eeb73dfd578c59ba2160e051b98488398fbe0b +size 794578 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index eec5dd8fcf..6eff084046 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14531b6c17ec208f08ae37f10cb7e60c01993e1004ff723931e8cd0c904e4977 -size 732723 +oid sha256:08e5b6bb838e66a5279dbe620cfa07f5fd25ab1aedaa6deaf172d73140c53fa6 +size 704734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index aca39a2055..018e8d69ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9636155dc96dafd33eb672ef968bc43644e89393299950df469239dff4d55d5 -size 818373 +oid sha256:129d5c2134cff214b39a351ff82bc411b605f1d220080b7d3495682d209636b0 +size 
787426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 79e650621f..f042f8af81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba6658ef3a14258800088987a5bcbd48a98a75bc5d8e720339b0f84ffd57cfd8 -size 723843 +oid sha256:650fea529a2a6c2e586a5d8582ccf2b07086980f5c4e1db67a27e531a7dacac9 +size 697582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 03c8dfc57c..9bbe26f2e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecbec6aa4ff48e1abcf1c240064b535540e6f7c68b1ce95fd464a6054d5dc71d -size 747269 +oid sha256:287b5fc5d1dc924b7949d917fe3eb98221968e389f3b304e452f115e9e898894 +size 715974 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 458de30b7d..84f8cf1b82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:953726658e372d65984b94ace2d193677f203272439783cecf2c21f0b864ec69 -size 656487 +oid sha256:04b7a6fb47d15b0c832f4fb99aec445a7bfd1f8cc0be34a804755f63b324a53e +size 625046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ce3085440b..ec53b28a22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d8f709f135f2987de71129e3ff3bba7fe3991d4514b2af430596e87219671a4 -size 750423 +oid sha256:c52180a7c89917bf29986ef9cc5465496cdbd178a61c6691df797b01d4c7e400 +size 720116 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index f126f199ff..b9d60e0b80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2035832696ddc1e1eebca7596a5f6ace972bcb1ba8ad92c5e8303f3d08c4dbde -size 669805 +oid sha256:d293a2800004829e8c90364b96276e6ed7d792ae9c5c2907120c74adfb22e63a +size 637130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 72eb223099..005fdf41ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7295335e6fdd4dc4677a73654102bd98c226628a152fbbac77de2868d65d070c -size 870111 +oid sha256:cb3dff8cb6c0acbba0bd82f4ec5cb6126dd68c07e1cf0edfea68de23c65d2dc0 +size 823228 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d15518e37d..43d52a5e7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:777a9138ee48b5ae392e9a808b525c1d33f3493207031045ca9e16485c6d39ee -size 776123 +oid sha256:d366855ed56bd30f4fb3f7275d07a5995085a1e514295a3fba30657fc6c763cd +size 727760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9d024649bf..3148f3a861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7a3df6ec470073774efc67b3d4598296209f4c77dd41baa28990b5091569094 -size 837649 +oid sha256:5c58d4a462c91faf5f6dfed21a116ed1021e5c63bbb449d85d84e723f87dfa42 +size 804184 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c6e4f5b0d0..24625da185 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f039d90525cb66fb9f1887c854fd31f1548fb733404048a56e003c3d4edaa854 -size 743463 +oid sha256:e5d4298f87f2ac5602fb17dc4f1ce00d8b77f0a17bfeac1a4476c1c2d735adf9 +size 707040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5509a805cb..37f5656276 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa2cae51e84e2bd21613e61decec8e6dc4501002191e3b03173c6942625d56a3 -size 872673 +oid sha256:ebe6f6fecbfa09d543aef5e442eebde5a7173c02287a70018f5ea2d3203c4984 +size 826630 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 22ad7478ff..601de6a024 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a6f79556e276c49b388d63ff0e3cd8ed08e913bbd49066b9c686f4c98714948 -size 790081 +oid sha256:bf21cfa59d36542e673d3f1f28d60b758742a7ad7c701c97f4d112826b087e83 +size 750894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6b3406c8db..0cd6f6a061 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:820260c2402e6d2883578a5e2f8302d0c4e80e7e5a62d089622aef7b756fb8d3 -size 839029 +oid sha256:048cb357c05a2f4cbfbfd057cbee25ccd58eb6741d02e2e27371798bd2cd858c +size 806304 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 06d4259038..c3cfc831e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f08c5c8f2a1c39a3df58d77e9011f05c0956105d6a3b730bb505d6bb2242f9c6 -size 758409 +oid sha256:de84214286f486deadb2c596d65187236f866f9927d1edec80900541aff9339e +size 723810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 61c3cdbce9..c149d0d2fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a3bd2461b16ca183ba75a46686d3d8afeeffb424836c789fcf5027a7558dcdc -size 748061 +oid sha256:36a85526e4ce3383196adfbb26df406141a4976f302e6e9fd95af05e03e608fd +size 716768 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c1f63b93b6..ed36db621f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e336c870d27bc16332032173afaeb055b87ee47801faf9d131095a8513b96ca -size 656491 +oid sha256:963815c735e546f92b3bc55301a8ea632baa42593669e46b9b13766e7b1e48ea +size 625838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 22e2490207..ca7a90c49d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99a7f66e1c4018ca1064fcd25524e2d748bd093657664f5739dc1781d6a94b07 -size 750427 +oid sha256:af46ca7387295523fcd26aaef090f86410ce0fdf59148ec103ade210e2e3e874 +size 720910 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 98e8f1b5d8..4cf48af957 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbf9afe19d318679e25a8d838739628e22211152f6005998dd9f883a11bac2fc -size 669809 +oid sha256:1c5e9b5354dde6a2124232344c19b9557f4b592993c26282323493e3b694a088 +size 637134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9b4cae7f02..c47f386746 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2c061f43d4b3b84282b919d03f22e494cbf4a3aa1081f5c134cdf37b66bdc10 -size 654303 +oid sha256:e5bcee4ab3fce235820171776348ebb12574fe96f3a6f8637ddb1261320c196e +size 619308 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e63d8fcf05..551a5bb0e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0c78f734b872f8220c42a058aaea2ad4abfa52f5b3350721af06ecbf9cab4de -size 570970 +oid sha256:185dfcb8199b8f810d5372763da351f8c8f71e46db2a2d99ae6bf246551a27fb +size 539479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5af82ac1e4..59cc06a14b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3e3a8db73e946b7f48aa88b3742a5edf63990edae4a127b5541ac57fdf881e7 -size 659677 +oid sha256:6587ef704d2992621cd277a5f558d217d3bfd2194aa3d7f44b19a5f29254bf4c +size 624190 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 51e29c5c28..00f57acd7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28dca54f96f2f2dca8908303f1f2fe69eb620320de136a212c58dc50e83a4010 -size 575160 +oid sha256:e43df4d9a08fabab1599a0d17e20f85a55eef8f1af973d6f9b34ba4e7564c419 +size 545445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4674d4aa62..92d9d1ad59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:148fc984eef4e1686d6dc35de010a93e1715cf5b42b32a973f1ff7943fe337ce -size 724437 +oid sha256:d2ed3021ee8c6af99f8db3f233d146a62ca39f04737babc9e87033892f95b41b +size 686780 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4497716000..9f7d04d195 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14fcdedde2b56f8da5a5b7144b7a6b29b561be51a013cd3bb7dc9d5627a69116 -size 639279 +oid sha256:1f5941df94433f51fdc887cd2b763056769f79f803a60ef40959443a29c39f4f +size 607097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0bf2312bcf..1fdfd61560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2ff9f62d420f58b69ad453b22e84a78415d83821617946752e9a84296792fc0 -size 750209 +oid sha256:0fa6549a2a463b714731b74d9435149951074d37f0fcccf48fb17add2565f6c1 +size 705842 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index eb40f2ecbe..721b411f66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:883dea7b3a53d60110c377ca5dccdcf40158ec2b9352997e815dc077c2d2d3b7 -size 656269 +oid sha256:ed9d31e53e030a500a117d5c4b9857319b2a7ea4beda53985d582bb0e50ca8fe +size 626062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d865c91f9c..5376e8c35f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8505ae6894b98f7cb115e3b8dc8310ac941fc5d3f1be546c4e90794acc6bc48 -size 740489 +oid sha256:ff00c1c50298661979ba5fa941c315df114e52f43f5b0277d0b5f8d17d062e29 +size 698688 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3be0e2804e..c923a6cfe0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03051b96467c080acd6fb6cead37545deabef1985307e53f8add801519b522a1 -size 647439 +oid sha256:d2fea12cbd81bb152cf62c1f756d10097549f58e489516a3a82ddc4a56831281 +size 618958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 09b49164d0..8334581b7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b9c54fe6fec9e73b196cd8374ed3cdd6fa7ea26ee821ccebd05d230405f11d0 -size 752377 +oid sha256:6c8e0562394c465b3c8c0fd72e8fd373bf9841dc013f7a80653a5e3d9d959711 +size 708306 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a722799d85..558f817806 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df37ab26f68b1b4f1d8a22af96f0328cf0adc489cc21e35245c61e43eeb4c636 -size 662435 +oid sha256:95c31154f74b1f4c196e1ed895d129076e68fd3a43cf01b27a65ec9c64b9e321 +size 629710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8c8d25ef68..cd3ec6c36b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c0ece162b0522e662cffdecda4fbd14c07df89eca6dcb2b8533644722775a31 -size 742659 +oid sha256:b8bded47a8c0f2f133f7b4f5c9eab65f8c90e81be13875315a197b608069dc71 +size 701152 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4c49577646..53f17816f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ebbb44e85485a3e5712fbb25602bdf5d5cf788f4f6132393bf7d856c298f79c -size 651087 +oid sha256:c71087199b9a1e2d6358745722397ba6a2f204e3ad8d5b2d3e517936a829674f +size 622556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 12860ba9d8..aa4adb88e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07ff8573017d1e2130ea4cfb6d463033a998eb155837cce616fa15790534baf7 -size 820639 +oid sha256:7138fe9227eee0b61bb07a2d65929d871f10803819e6b9fe26af072cc7593a7f +size 774842 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c81d10df15..75dd704743 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed1a49cff0bf5e3357d06294d55f1e8a84e19c03f2f2b084fed09447a7f166af -size 730795 +oid sha256:7c2867e0096a21973b99f4849bacb708be10d175f37a1f917185dad57ea428d3 +size 693138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ac67c561a..23a4041bf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eecb214bacb15fccdc8533e9a2d4d26cba8c686acfcb340422e2cb4216f44e6f -size 810921 +oid sha256:0d97531092a61329914d60a81a6483a9aee24985d6c47c5274ebfa2729d40586 +size 767688 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 76960da893..93b678ec74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc1a4bd327b5e806813381444989f52a06fadbc6573fede8b1d87b569f3a1745 -size 721175 +oid sha256:97eebff29078a3d53fb26c8eb70718a674e5d079bbe371d89b7a12c2817d902e +size 685934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c1ac3547f0..6b040df0fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79b40e472a027e06b449752bf5c85b326c41fa073aaaa1a3e9b4f7b631bc0355 -size 631855 +oid sha256:6fdd06378d558fe5fd62ea6bc65c0c37466de6105c7e9233c3c2a9587174254f +size 616003 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 60a7a62e95..8c23a94432 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b094a507856b1c117b3c0e8669bfe2f5ca6338ab0fdf4e2e13c4cedfe18b6197 -size 546154 +oid sha256:8f8d6181755db5eff0834fc15ee4becbea5b59af0979ee519f572546f53820c0 +size 529315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e718b9a671..ac2f15777c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4ea25fdc6145582971dfa810c976d40a9a24ada817f0e2745fe16852f0ef6d2 -size 635455 +oid sha256:461db4882535c2877a325f5768697e5f9ee6694d281b167c7aa10afdb5e9a776 +size 616543 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4c03e01d32..96e6ac95e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9385c5c862ecce66ea67314f37cb44c69f0cfeb52f7b2045965e3c46d09686cf -size 549360 +oid sha256:2c4c42df245dbafd18bd7446862df9b24d270094da151cc9ef996e888c781f97 +size 531829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4e85d90c0c..7037e4ea19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5270581acf442f7838e2039f8345558a25d22222499fe1f1b9970b531b8abd1b -size 704605 +oid sha256:42fc1088db30d50b07fb2f6fc5d6c6c0d752d0c1852dccf71934903bc27cc62c +size 680810 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3c6266aef9..6bfd3c494a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ce40f931a752cce6abc67d15aac4c5556d50372f57c952e1f078ad431e46fa1 -size 611998 +oid sha256:579621e9192ce833daf89f585177d7e2023e9815620c9b9495a05b51e90b76bc +size 595997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 15fae415a8..075c64aa42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9af0f4082f11ccc2dc2429ec8615ffe07052bdad3969fe7bed63bd10b4d3f869 -size 730525 +oid sha256:0f25ecc7c25b3ac32116c15b1757cc5e59879ccf3866d7f1f8fd226f369a71e6 +size 704262 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9f6c4fcaba..09c5ced31a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c4b179ed8def8aeeabfacea5c857aa592a49bd3b33ee57227cf6e401df140d0 -size 645367 +oid sha256:6f59ba9d873112704acb0ba18c600b51404132265db6e4910d15f0772a844c22 +size 618908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e736755e07..d3d057188d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91652be77bfecd06d6c0ca4540fb8495adbfa4d8b0f554d0db843ba783ebcbe5 -size 711037 +oid sha256:069a695e8e6791c21d0dcd8918ff5454cad4b8e482e8ea918dda3489ae5f04b0 +size 690746 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3c2f523c75..06196a241e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb005e11140e2615ac781c307584b9c68e320d3be08f1b5685ad438dbaf9fe70 -size 626817 +oid sha256:0c9a269c7ab076372ae5024fadbfdd90a08a65d81ecd684f6c0a6ba67a62a0b6 +size 605389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6958dc866b..a623f599b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89ef0f2daa94c54b76a0ce8fda683271a86e196d8d7087b429f8b22b91938d70 -size 737627 +oid sha256:b32e147b2314b8f910b81e5b4269405f955041308005644ce0cc5273639906c5 +size 705692 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 86bb900d59..b968b95c27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1087948c0f4996a465b0baf70e45551e12d064e1806556d2306a8bb0a790f5cf -size 649953 +oid sha256:93bf442980b2f459fb013d178c009e086f3df7c4ae0dd7eb42611fdf4c486260 +size 622014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4589e789b4..9c4e169b7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89ee3b2ed552dad2eca9465d27f84ed0862005031289437ba0302ca76415a138 -size 718139 +oid sha256:213cd4347b30fe8fe91a81e4b2308152c6a317459403576b45d10347c97851ee +size 692174 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 971552bab2..4daf066574 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5644fe7961b95db9285cb6a7ff8231f2aa2dd2d0d813f5b2b5ff2ad3f3fc6d7 -size 630615 +oid sha256:c9485bd60a04aafb6cc0b1bd599ade5e910bf619027a8ba1b6c170ac409132a8 +size 608495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c50a25a772..9f05853754 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72891755605f63e7432f9a7573558a6b028fccb1053d937f5ebf2eabf8a6403f -size 810377 +oid sha256:2a38ee9993ded99325cb3d5cf64fcb879ed02b217db18b005e7ff62618784483 +size 776420 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 490970831f..415c7e2154 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28962c076114dfe6b35bcb434caab40ca16c79a7f51164851ed92a7f9d77d746 -size 714269 +oid sha256:5f72adff5afb50df5dec137ac8454f1a0ddbba52b33da78d7131d26f2c003ecb +size 686822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e2168056ce..a3e577ece8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ce1aae420c84388c6c1082e58da97c368838e5bcda2b1a3e6ef7cf5a1dfb6c4 -size 790941 +oid sha256:a03a4bcde0ed64ec214fee6329d1f48aa3b99d9cd0899066ddb478e321b250b2 +size 762952 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 774987ddb9..4c6953142f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ccc076592e29fc977b5d513a09708f88ccd2210749da692ed9d0d10937bd20b -size 695719 +oid sha256:2cdbc720d2ff15a4085da1ccbc3bd97b638bc6a4549a57a2b61da7822ffb14cc +size 673306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c5b1954eeb..b242a2bd11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70e44e505911b3b6c16357e96b7d98e85427a8afc993e01398a83b7f5097051e -size 640731 +oid sha256:434d6a551326ebceadf82ed8b782784f7d080bfb00f1717f055571f6fa7d6f81 +size 610375 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7403f62277..3531efb034 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aa277e947443b714a887295facb79c8dc4a47e7f1400ecd295257ca8f967bce -size 554834 +oid sha256:d3de12f5ee8f8a025aa8ec0d90e41900514f908a1dfeb22a80dfdb8addaa1fd6 +size 527733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e633290924..1a4225b12b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b4598037422e61964e6abf95ec8d6bbfc3c512ea1729fbb86291c8ea54ee4da -size 643147 +oid sha256:06e0987623556d4e3d51900b48dceeaf4330c695edd82b4f072b4e0a6e30016d +size 615207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9f5dd28f43..0c9bda5598 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1bd4ab37e7afef814ce973fb1d8a8c5001db0413fe1cf3dc21ef0d635a291b8 -size 558236 +oid sha256:6bfce99601ad08ebd00545d4b007d423e584bf146b8cafcd155b4217df326126 +size 533799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8c08603467..ea747d2c27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e45bab6f12a1194a12bcd812a02be8b6ce918e271eea5a8a0cdc604cd9a9f16 -size 707019 +oid sha256:bf13623b3b9c09a98b3a63d43c224ed5c9cf443a5c45a936c0d4208467170a9d +size 677796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e3c41ae0dd..fa17f0e6e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad59f312c778c2dada83d889974cfafd56aa9d5c4001b4d4dfdc98eb8c9fe769 -size 622305 +oid sha256:cf46e81d5fd1d0ed6579686c7ab3380b514dcd02954eaf4874eb74d11066fc9b +size 596535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dcecae4c37..d170d99714 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4d1eda10f96ecf4c0b21d533d5fe6316624d75b541d3ce89bdd977be46d8a9 -size 730323 +oid sha256:13bc8ce4d3fbf45055b3f0f5d13301d2e9c629e2f2b72d3c7ad8c6e8f0d0b75b +size 696858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bf38ce2046..863556ee83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e25d1748c6beec9fb71c87d3aa89cb70f514eb22ead0b8bf827f33ad49d81817 -size 640085 +oid sha256:98d6d324721bf32678934429d334fae6c77093fe5340a563ef3bc2300beb8532 +size 609925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8b2baee6ec..216d23e017 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8eda8b9c3cb42c32a5d80fba09f59350a8fc51aa06437788cbc7710cf23939e -size 720605 +oid sha256:b082b90d0db2471611197c926cf458553301da4fccc2597789498538b4026190 +size 689706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 32d1d32bda..27e0958bcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5581cd839bd6eecb4a4de37d3896cdb1762453962399195ef1e65abe10ddf974 -size 630465 +oid sha256:463e4e81e8fd1b5ec742f4ecd00b0e4cc9d1ff512e330c8c293f8937e3f6bc56 +size 602771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b4d74f129d..6a332d3595 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c6006ce9f8cbcb5c528d0d88bb00c9460deda1c0a4fac68829742d1eac8c9c3 -size 732491 +oid sha256:26270c73a275dd66361bfcf2446fc366b14a8117b14143b25b4dbbed8b78e05d +size 698534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8427e4bd24..06a917d89b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7291fbf5d0be7cbd1963d96a1ff71dfff9fec75991714dae1dd9357b9a7af45f -size 643733 +oid sha256:874db82a70817001d2860913ca02dcef7cb52441cb215e88212b433d58268b79 +size 614609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d5a9b2029b..d0ee45f0a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0751a96fd75eb568bca7b3b9692f41f0eee751e462b574ab6eeaa0c507d88759 -size 722773 +oid sha256:f9cac50820e52862b23c2c04344934e861a48563ee6aaae426392a30d46a987b +size 691380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp 
index 3f7dadc9b8..a78a4301a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f2befa45ea46e32e34d957b247d6e0eb485ddd461c5bfd634adbc8fb7d7497a -size 634113 +oid sha256:93eb42f1a9e3b7bcc85cca7afad81c47b94b681aa872fda2f7a31402e54ae98a +size 607455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 64347fb73d..46f3a59ca4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96d300d324755dec3e25186f19f5b8cf84fb5703c6a3fd8cbdf11c577e248358 -size 803073 +oid sha256:2f9bbda60c2641b386bd73004a39e5f1834c9736c9413299df3d7d5e42283b52 +size 765858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bd5c0ba0bf..b71e5ddd08 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5840eb0705fb92791ac0fa9d2604fbc5d86058cbe16ad26a3c028a55a74aea19 -size 712093 +oid sha256:11c8099239088337b44173fb8c1579fc7ed5cbf70fedd4747120703707aedb44 +size 680454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3db05dfa2f..c6826b4d6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dbffa226966f03f28d1b59260f1da3c29e5f883914f4d348b31dfaeefe0e8c0 -size 794093 +oid sha256:a9dc7cdf5b97177d682f3f51d21d141fd70bd95255ce5ca9d1a6a3d8cf81ccee +size 758706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 524f09131a..44461db95f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5376ffb89af1ea11dbbb5ca9008eb7cdfd0466568f427edd22095e076e570c62 -size 703263 +oid sha256:068cd4381fdb2757f8dfec6bfa90ecadd50f2e72a710bb38357785cbac4a7a02 +size 673252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4251ddd1fa..f14d0ee60d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f227305921ec57b5bcd48452686aa8ab3daf738747f0cf78503c0cc5712b4ca7 -size 684953 +oid sha256:630c7ce9456b8e472ae230d1d70bb794e4bc7181a00db2d015ccb26f5c1e4606 +size 646506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3e0271139b..863fc7af46 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e6f36a1c69bbcff789edf9050f9d62c4934eac5e55d4db65446d0b523b00398 -size 593134 +oid sha256:c879bd6c8cbd05ab4e9ef92a85e4c35e77dc0122af2822464b7db539ed660988 +size 560163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9538f987ea..08ea67a4c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:222e5b1cc8e56fab03201a80c7d49ee8844d898317ec82f68d67381a272b8942 -size 690377 +oid sha256:cc59c8b442b9c2ab428af7c588373850fbc1a802866af4deb5f93dd82ff307c2 +size 654988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 08ad7eb5ae..67feffc268 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bef289a5a1e9cc35bb7a3b91f0acdfc7c84e73626a21c4f7abdcb09a5b7d311 -size 606846 +oid sha256:e20041fcaab505ab5dd8052d9ac06436621264fc06245f1361d34a223dc0bb75 +size 571557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ccd649505d..2515b2e518 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3698ef895ef8f68ba7dabb8eb22dd113b3a4b5ab0087bf3467bc6fab284c6433 -size 780365 +oid sha256:cc6ade3ae6f5afe1c137e43ba7073edcf2c67cb478904648568c91f4dd074aea +size 737478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 28b15a044b..c74deb85bc 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47e0f94ac645a20e1cfd4c47cb63ef00c5821fd119b71ab0c4aa5e6113c1f1f2 -size 686575 +oid sha256:36cfc6110c69f8034b25bb8c693ff2d90386ab93e3289b41af7fabcc8514195a +size 646696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2b07a2104e..1742177464 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5116686d4ce77579ebb2e1a1f39a368fd36a18cfbc3d6e9a81f82d039d045c23 -size 767637 +oid sha256:5cde12976e4580c9e4a2485122cd28be168412614c39bd14e6f1d108046cea74 +size 727956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 24b837e5ab..8cd0244f0b 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3ffce4cd8996d63a87a34f3e13ae5f7b76349fcfba049fb2b9bc0f257d732bc -size 673847 +oid sha256:146c10b28f61acef693e07a421f7caaaee8cc251d37a306fcf528a7bbde15972 +size 636386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7e31e62922..5acfeedaf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:627df41b60a06850f96965a19746b623077d0c3d56ad1ecf29d6a11ebfce357f -size 784803 +oid sha256:f8a19aee2dd6ab3fb9c24b71994f0125405352db0239673e2524e7f944065124 +size 744136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9cd4babc77..a0759d2919 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4218f5d9d8fee4aed3dda213b67a240f31f7623d0b35c8f3e31be36e486a2624 -size 702605 +oid sha256:7a672dd52a908fadef9f8849a6fd92f2b3e4ed2d9670caafb580e25ab2827ecf +size 662382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8aa8c42f32..7db8394d29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ca7337c7998e0045e7efda5bf12fb9aff6b5908db1c353f0ddca215bd16e2ef -size 772125 +oid sha256:b3b2e6103a33d01371c9b85c8ccc861b5ebb7e39d05f4f739e6df004f005a3a3 +size 733332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index b89335ac6d..c27c6bcf56 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a99fb3cc0a5f132d52f12bd80959b17e1e827cf0c3b4981719a6d0ffcb68a092 -size 687559 +oid sha256:616a3918df4e8728848dad898a26175fe1e042daaa21fb9438ef72fef945f163 +size 649210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eb4109be48..bbf12bc704 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c4a058b97416472eab90d81e3f845e7188499cbd10cf55e4de8cd6608dd27c8 -size 684957 +oid sha256:d5dac2caf880455eae96bde3e1521434e3d3fcb84c090da08e641c4b1f6de644 +size 647298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d883c871dc..aa15bc0b9d 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:113f0cd184fde8639c723eaefa4864d6b8f711c72e34eeadf618aa9c5e14a39d -size 593928 +oid sha256:3f66fbb1a6c9f169b6ade2dabd62f3d649b493ead839927fe2a316a020fda4e9 +size 560167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 29a4614c63..3720af22ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46ccd71cd8a04533a9eed374faa6ef89bd6730914e65340cb7103b4417b83a9f -size 691171 +oid sha256:545f33b14b33427936ff683849fa86d9b3cc2f9b45895a5e8a1c80785fa79cdc +size 655782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index a79d1c51ed..b98a781f44 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c6afb84d15226f36afd54c61b82e4b551a4904260e271b37809116d1f053fda -size 607640 +oid sha256:c1819dcd66236657c0ce7e01af94ddfcc1c3636265a0c6d89d2efe8cad9d07b3 +size 572351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5d3151ee5e..0a211f400e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9fcd25b4c7db083064965786582bee4917a42931e47a2362412961fb6993d10 -size 664901 +oid sha256:6b108fb10bd962636c22287272a0ea78da174d906e9b6472767c201b52e38a07 +size 639774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 039e436db9..ee7fe62ebf 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db8d5016a94f36b26a7c2d11144629b7229104550f81e5052e52be2a64b3551c -size 579694 +oid sha256:77081fbd2b513cd44ffba4394f1b3de0a0e411465a82b3971fea5b2b136fb5eb +size 554319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d432abb02e..58913c779d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd9abe3a6de2ff3e58c45e883a6e34877badd4fd2bc5b13a794aa4ddd662b63e -size 670573 +oid sha256:ab375cc5bf09881ce138ace33e7efbc09cdcca36d4f9b3d3fff3a69f6b411538 +size 646482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 5bfd6c88a4..84852b5841 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:603783d3fdb4ff66142d7b661fd8a562b25909e02c0d1004f85f8ac32e3c8bf3 -size 584674 +oid sha256:e99393f93c3461291c34bd78cc51655b15bf23368255f35f9769606349b6aaab +size 560337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bb501eed29..bb51dc8aa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:497ec4ee1cfe8d6b5aee5ec6a1e643276a48d810a086f521afbae0fb833d1a44 -size 735183 +oid sha256:0d7b57abc1f106a6784d07625a487930d1fa93ae798a1ca9a9ed1130c8f4a939 +size 707244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 2ab7e5722a..6e2565db1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51c558a027aecb3cfed7ed0ffe9579961405f0a3c9c21fa82b0b6c8063356e01 -size 647955 +oid sha256:1413560f6475806c6eeec9e37aab2ca9a9cd87343cfc98a5ad15bff8304f5205 +size 623024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eb49590291..1792e012c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b50fc8334738d492332571bbe16de077bd56b153a6ba553f44d6bdc7722b722e -size 760955 +oid sha256:7689a048f90c6f1273828b8fff0b564d86cd465ce2b974f4417f08016e8c6b34 +size 726306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2fcc6ac9cc..3d17fb12cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:703fa16a4bada5a059dcc6b91628325b3104e826bca3b38e575ffc1b1b4aece7 -size 664945 +oid sha256:86bce616b3365b5747938b64baf1d5f925547f875c660f92bf8335e2113ad0bf +size 641298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2885653fe7..fe3115e913 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1927cafdf455f7f7146ddbb0ea11ee291195a73d10ff3a67e4b16ba43c50ece8 -size 751237 +oid sha256:da7563354a7e70bd788c84b7c42e37ae22bb9e86f40b6d0c48de6db930b0814b +size 719154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e69efd2488..270635537e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 
+1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:325e6691a00b3bff9ad2daec8e62fb3b4ac8830a2a3a74545936a00b127f4938 -size 656113 +oid sha256:de0c44657f525b1e98863cd78c5b48c1985c0852da0a303d4e30087aae3c6716 +size 634144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 05cf50d735..357124fe09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af9eb1297d6b03c3bc2d7eb60c027196e0e6c9a3d52d90446279d95a134f31b5 -size 762335 +oid sha256:c505f5b4ed5da6730c6200180c14bbb575e3bf24a09f5b0a86749499d545ec4d +size 729808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6240321ff8..3b8a71074e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2c7ce487f764e5f5c12f8349bbc8df5f7c3b3476fd5779c7de718c1101666cc -size 671109 +oid 
sha256:4188c9c5f350523d2c32c0c2c97b10751341d0108a89b92a817a534224de2ea6 +size 645736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 512ce8b0f7..0db74a667c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc4185b62f92214f0df2c23e555b89cb8b4170c88f839be512c2e5987e0dc1fe -size 753405 +oid sha256:a93fc85a8b5896804f381d9c8fe3651a2afff5b4246246e6d5ad6a3497d54cd1 +size 722654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 722b98b3bd..0f3c429d73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94130d9acfe0d507734934c66b8cad35b7907a075b9ece689392e2a614f67495 -size 660601 +oid sha256:f6016359e93c3ff754456795a7fdcc1001a55a38602aba2e3e992ea37e243779 +size 638582 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a35d87d250..d9b7e66be3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73a525928a30b16881d26d6c375bef6279b1f648dbe40ef845425ac469eecf09 -size 830645 +oid sha256:1c2b9a5ee2b03adb1713378dc0c5ec1e008a26136d88448e5f6fd311d2173672 +size 795306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index be3a8f674c..1e423f22b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05aa45a736544159d9e50a147678f377fdfcb9848b27a3d311fa339ebfec6bc7 -size 738681 +oid sha256:ac27a7b7b15fdee2aed0d2dbca95d419c87b5b02964e28b6228db18f2bb299ed +size 705512 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a20f63c565..fc6bfd1cbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8a94efb9e82acb0378e5a890eb02d09d40a8985d1dae8b051b198657f77dec -size 820877 +oid sha256:68e8cde80373296635bf00dc8c880d1a90f08caf20fcfb488ef30e2a32e1c11a +size 788154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 14df93749e..ff0c95fe24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9972c79264005b0701c45ab81209969acd57df7d5d7ea4bfd9719a83af3ac5d -size 729849 +oid sha256:e3e4e5264407370d595a5369a1c5bbc190449ace8659ff239e623c2d5b39378e +size 698358 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 65066d9ef7..1853c2fe74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbbf5ffb10eae7b987e68cae47459ae79ec161a37f295128b7b9cf920a35d106 -size 638607 +oid sha256:ff958ccf7b08d8a3f470da65cba83ae3e151dff780a3dd5ba7b2c54d83c79cd8 +size 626208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 93539fb075..4202593677 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:394df2e6de859a4da0317cde06d5734bf1431450904be53a38a97bfead932f25 -size 552116 +oid sha256:53399581df6b7137579d15cd2a42f48e196d89a754ae1233ac4db18a51da21a7 +size 538977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 281ed08186..8e32c38c7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f38ac476d0a73eebebf308f4d5ca0a5c6c36c882bb402ee9b1b2f0e16bbfa9b8 -size 642797 +oid sha256:64c3f15bc285c8037118637a7e8c973b513adcb4d163b5dbe50b43c828eb353b +size 626206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 20d778b2e9..5a14aafb74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2eebcf77802e9b4eac128cf46300e2526feae17b4eee6329767a9f730e1a1c34 -size 555518 +oid sha256:a8fb929dd4a6c2d4441ceda1698e97347a37902339b46ad244ecaa88e99071f8 +size 542281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 983ece0665..810be565d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08e84d9a66c69651b70e64887512b43b0932e817850f0cda4d41e90418fdc1f7 -size 703511 +oid sha256:44d06f175322970f20913dbbf5e8777ad550753dac59618d0bb5842bae029251 +size 691260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 614ddc53ca..565e6c1549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ff9ef40aed800496a9622fd94797335f93919a93e3cc632195ba97215cb86cd -size 618799 +oid sha256:2681dcb0cbc35d109a94cdb29e9d4df46a66a6ce950b7065f90010cdc2340503 +size 605659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d4622c7f3..75fd4610ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0716b2fbaded07e35169437642246d1661c7ed4a8176b08c99e1a8cceff1deb1 -size 737817 +oid sha256:1f1d0b527b0f74bf090c2b5b7fb0912ce72978d4aa5faa793b0040ab000da4ad +size 714714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 760fb225aa..ebdba6a0c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cdd50069f520bb1efe2f194d376a296e0d7275e97a6d7faf0d2b807c142e5f1 -size 652167 +oid sha256:9d1a0eac0211a7060f895f9ab5bef2d260ef8afa388335274e939f9cea1028fb +size 628570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
13fb9f069f..027fed4994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fc8677e5d1957982fe593c340f9b7f363ab7c8a6440196a284d4304e05ef186 -size 718381 +oid sha256:59ba5ecc5b62fd42d06dd3dab7197a48bb0734c843047efc25a6e047fa8255af +size 701196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6fa1467097..0118881d07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:495f0d0184c055d57cd8ba43a7caef28c2cabb3615c0f12da36f034cd9520f62 -size 632779 +oid sha256:b80f6b39efdf5e112f32fc52295d3e9b1ca31bfef74e46425beb2b76b29b7316 +size 615051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7c800c612e..9d6aa04534 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd6a4b882da3d77cf31e53170f6d1938671d024e0a9fd05f2d5dd1094670771c -size 745955 +oid sha256:7a34ff1d23665ac2ccf4f9c93414880841bf521b5ad74da680d4f5e32f966a9e +size 715352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c6bd888302..f785f01ac0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b27b8da059a9452058fb31e8340f751ec283a5eccea9581297f7d3770bd028fa -size 655963 +oid sha256:67074a5378a16e97332ed6e3735db8c5e49e7c58f6230d0e10557d51732561cd +size 631676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 66d2e2fa22..19f99cab04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42f483858df682c6ab034a70d168a7b559b65c2708a1fe0b69d1541f6fa32025 -size 726469 +oid sha256:72b6e0bba5bc7ebc99a3b2b4d5b44a3b33860162e94fcc58c6e8435d0ee00633 +size 701836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index d7ded16e4e..faa8ab36e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5f43cc289852816a0c7149e1ed30529226976dac108ff4dcd6fd804b6f07899 -size 637365 +oid sha256:4673866f8b14d746da57d29b5f17bd8df16655994b7208614f9f69e04f0f1b2c +size 618948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 64afdaa156..4fb1518bfa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a41644839bfe74da242653818380a199b4c6970b6e3e73c7a91409762b939c16 -size 817917 +oid sha256:9a663b36c602e8ab19b653a22d6e3afe6723c5fd121e2e35359e9677bc8af2fa +size 786870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f11bbe0826..1c77b10f6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dece06b4fd55ac98a8217b8f1d7892de3a15793be2fb65381e5c8dfc0f83be5 -size 719489 +oid sha256:d19c4d3b005e5d5f11f226718b29c5f461d19a6602a740980bdf9cedcc2cb8fc +size 696434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fdd601820b..eaed2b5f37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b69aaeb3199062b50d7c751b9af4ccf52b02d0524f4736fb73ff45ae267a0d4 -size 798431 +oid sha256:4bd148112f23536ddb7e7a1d8a89052948f6558e5e7252ce0e6f0922f463ca29 +size 773402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 42f40c6952..53973ac95e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae57a7dd65e597bccd5b9907ea0a33b0734143435206a8a1de2c2ac63ea90053 -size 700891 +oid sha256:0f2b3da1bfa6edde871215aa2e192b7f142ef9f12e46cb22bf2ddd80406724d1 +size 682966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e8392f6c8d..82a9c93708 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f7bc1991d079e2ba0a35e544c725a541f57f21b5d515adbad6dc28789d24839 -size 668153 +oid sha256:24a7091db77eb5e31c73b4a2d69567396d29fa09e393f4ccf798f81d8d6988c7 +size 661526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6d19dcf3ef..540a79c5b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6347ff294c8c0126af4642fcb05bffa534b603d06802c3a84442fd2581a8c3dc -size 559808 +oid sha256:401ae730464e559566d7e1293cae1fcb86401def163c684c19604e16a7cbce66 +size 533645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9ebe35abdf..a145db1225 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:9aca5773456ca47314ae5104a976b8b90fe053e1aa051a89ea1d669ef43bbb69 -size 672197 +oid sha256:db6eb1e6414d25e074c229ec3d909b87ec6b7f3852bda2e33a1f3d86b80e137e +size 667246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 74d1b7be9d..f9e26708c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7666b16f01866d904a467c842428ba8a37b4232cfc93420e81e983ccd558b896 -size 564048 +oid sha256:6621b658c734e7bc2d6dd2d08c3a2a3c75c4ec3d19bc8b9bc0be0b4d945b321a +size 540501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a46fa686ab..52eecd2be1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aedda1778b5bcb2b2c4d9f87c818a681db5f2d78a004e767f4f6a5b211c13666 -size 734587 
+oid sha256:e082ad0bed2d0443629ac15e94a3bd8c8d316fff0bc8de366c5f34087d478566 +size 729786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5f022a50e6..b81e3b14a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f71fc4bbd01e7b2e90cdfad00b9f2bd1f77a68b01e179bf9371cae1cdb0544a2 -size 627329 +oid sha256:26e9b221c17c36ba482188b745042e4d368ae24d76c55dc76965bd3daa6632d8 +size 602447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9d9d91c5a1..6bfa16c8a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f7856bf450530ef08caf92a7780f877645d88f936aa041b6815868bcb4af622 -size 756857 +oid sha256:a2fddf4822e148f29766f1101c3485c1a41f8250f24b6acbee34cccdc0987e93 +size 747862 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5d3011748d..202f5b92f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dea8b537cb50df578771be16eb29f8ae8fcdf4224d7f100ae7ce9e22feb718d -size 645109 +oid sha256:b703016d5c33ad1b3060811e05572b76214b393a6f42b43aecd588482c1fec28 +size 616625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 66a7aed72a..31c6d3c5d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2740e7bb98aa2fcca35cbd91f966f77ebac76e84c5b67d44a966ffa005b5a89 -size 747089 +oid sha256:520b54a4a2e160e31db9ef38702b4d24b70f1b42e7a868eadd7aa1be49fdb501 +size 740708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 04483a76f8..4e529c1862 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:454e63ece3aeeeb3b3306b3fb42dc10eaa58ec193258e5ae41c58a6059859016 -size 635439 +oid sha256:cbe90d4660df5cc280b7fbfa55f78d56c10b11759c6a5a1d36b706e93959ca81 +size 609473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index aadbaa5ed0..0ff30129cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da3bd16df07bc1048ffa89bc7a85ea8005887bc31633f328c5017da28a6725f3 -size 759963 +oid sha256:5540984afb9b84b6b75e9aff989d8256878a628b4ec2e7070578d738da9e28f9 +size 751362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8910fff200..66063dad5e 
100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e449ef07b4b4ca01f808025d53f960c7d7f8859875188e3df7eb22ab817217ac -size 650483 +oid sha256:0d8897c068ab1232b526ba7a2888aac84002db731d2f839ec526259c5b0581cf +size 621312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a349ed336c..a0f14e50fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3013db7d029171c20736022cd0295df96f78f124ee1b22cd6c8a5d81e19daaee -size 750095 +oid sha256:fa1207828645149e64dbf96133eb62c8ec0c7e655096e51c194e112a0b34d315 +size 744110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6a5e21002d..d591fcbcba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ed50ede3a991f938826021d2007bc325bdee7028af359d3d4b965e0cce12770 -size 640815 +oid sha256:bd06155627d910ae97c0c3d2e710f8d4110d528be66ab8178b1b5a355234d0d9 +size 614157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d965a4b03f..7675b082aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0711727c57e8120aec6869a0ba3f8e39dc9c449529e4ec6c4983e5638fe75c8e -size 827139 +oid sha256:fd47b601358482ebece318c43faf77f41288e9db23c5d0a672bcaf5fb1a41275 +size 817650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c1dbc7ef8c..ca60a7178b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e1ae74d6986be8b936a0888db905240925903ea06187b7773fbb3376663fcb5 -size 718893 +oid sha256:3df394f5cc3a2cecafc6ec877249e52e79814706b65426470e71c8500f7dee76 +size 685726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d1e23df742..bb59294b17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a02a01c81b088c74ad2813cc154a7287d6ddc57c07b901ac0f834bde38805007 -size 817421 +oid sha256:d487e6e398ed81ddad385e03a4fdc7c9071812ce95754a4ffd253f2dd7893000 +size 811336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ba918e1a53..cec0a0ad6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ac10034b99ed2011c3d411e636107e314ecdb510bde0d152f53f0be1a64c42e -size 709273 +oid sha256:52ef9e8da9086839647e16e7d3388dfccf53884af59faf8dd68f4e1bad37fac7 +size 678572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d70d151cae..785d90ba32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d66f7f77cc1329c12e94180b316aa70c835097432720873eeccd74eb29afbc2 -size 682563 +oid sha256:cdbaefc631afaa191e3dd7820da4bf2f167b82994324483c32505d70cd20117b +size 642488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9082ac2a5f..b1e15b223f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd47e9e3b611420d51e340fe95494640508afe0975fbd2dbbbfe57a012b6bdba -size 595924 +oid sha256:d565ad615a7f55192cf586c7a7cf5f178e88dd855be15ea80e6b18ed89dfdf31 +size 561079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 3ae1bafbf8..fb9c246738 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aa91c89a7255936eafb05eb31ac14b2e7332500ce77d992dd5866595ed4f2c8 -size 688233 +oid sha256:4110fd47c063b25d4558072440c0008ba4fc257cc7d831b1d9c8f75caba23025 +size 648158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6deb03a2c6..9ba0ad9375 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:420b584b09edfee2531dff86a0a0e1b6a98a9c31501d508d37061f5f88b03146 -size 600904 +oid 
sha256:89971955e748e7914f7a4b42dbc6917e7e6ca4d5558bf7527348cd58a7d0c77c +size 567835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3c1f12e5a4..73ffd4f1dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d6e821a6dbe8ecb18dd9cc9f96e5d150e0f566449841a058d11cb822edf2002 -size 752895 +oid sha256:c273c2ad792001a547f37f00855bec4974d46cd8f15242aeb3d7a8e6c02bb7a9 +size 709958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 01cfc0137e..d24279df93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d054010c6e53571ccb7509cad288b2cad34b51de6498b77bc7e5bd33f0be7241 -size 664185 +oid 
sha256:1a483325f9cc5b436e2d9c46081851c269f6feca3aed20e81171fc825e996e3b +size 629488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0332dbcd59..eaabe86dd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5b24d49d30a9b956f4fb6c36d69f6bd10290993e1d94986fd32c2bd65d54629 -size 778665 +oid sha256:f6b84a743700186065282e116af4c026806d96a030b3b72c05e43441a849b06e +size 729020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 52005339b0..43362f16c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e25806d7293fab08d2f994bd0617d0001f46ed409c4d955c70a9e36f4b6f7cc -size 681175 +oid sha256:e3dc05637aec35d7512a601e4cba1a2d7519dd498a669acc7e2cf646debc70fd +size 647168 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7657f77fe3..6cdda87b0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ae203b027a0c9ed6fe5d340cc7e433be9ecb6e580af28672e8f1ec4bfbf6bf7 -size 768897 +oid sha256:6c39c3bf8babe3e830b004c15727ffee1fd4f970070d6ff33d4f267727d34f4a +size 721866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d9ff60e224..bc22ea7eb1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a7ea0c8d7d13db4739b755b61baa67ac3930a5e8f2664864cf2c7725f87ba70 -size 672345 +oid sha256:8c5a71c2f021bd973bb0bf8641d83942571a6011d313136376d80d18529c69eb +size 640064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ef88655d0..01c5d0fc22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1707708e528f4839fad3fe88d495c5c318d0db7db2f7b52eecb6c90912765da0 -size 780045 +oid sha256:a0d6895a13671f777033b189c0e5ad3ff44e991285c03ab0c93a30f5d0a96c5c +size 732274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8848a3ab4e..4dd83366a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f34408e563df231b3115b9c3be986cc0fe65eef98b747030d85c24d880f64366 -size 686451 +oid sha256:8116c09b8d228dfc70b7beb92794b7e33dffaefe080082a3f7334938a8499645 +size 651606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 
d8dcf14259..f6f950338e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73850acc8aa4c55081ff7c6f421c3a0369b415fdbdc5c83b40babed266acc553 -size 771067 +oid sha256:e4dfae3537e82580118f57c31f38547eb372f1da3bfa3e79e2d3ed110d055bf9 +size 725120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2d55a18a50..6407149784 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb38ea59d4b8c9f07fa237f026722e3146c5e5520338167c3a659db6f0953f19 -size 676831 +oid sha256:99f6341cf72d13113742eec400da7b2574629f84e9f0d322a8f997647cf56d08 +size 644452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ac64abed97..3846e3e713 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca8529b0876a6afbfb859372c40939670e7070e2b8d5850cda4a41927289c19d -size 848307 +oid sha256:faf23f0ca7d7fdd65e6a9264284f807ccaefa6bd81cc5ae15575c9daf028b12f +size 798020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9c818afa20..f2a514dca6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9fbaefa52163e84ac16e1ad4689088348ba1d8cc940fa5e7940dcba89d39c3e -size 754911 +oid sha256:60dcba92840ecfceaef1848b48b4ba56b967a82863d69d3d3b488c4d8804aa55 +size 715280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 513dc6fb34..1a11bd6130 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e92a1334bf4350911703ef4a13e86a58a4b8729a0feb390b224dd43c97dd1ab6 -size 838589 +oid sha256:46acce73e43b629716ef352cf9a34be9d2da4ca6b7afb9dbebba61d901166640 +size 790866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b21ca9b5cd..d2ca56c3b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebe3fef20e445a5558777a21024444bb58bd5d685c431570b2d33dd93124b6a8 -size 745291 +oid sha256:eedc91b609c7d9051cf3f566cfd1f4d6df793cacc1953dbb7610d1fdb36cd364 +size 708126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dd530ddd09..f921aa35a9 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df40e24d3814cb81ad8fa84049f00a278af632e778a8446ca34b698702729b9f -size 656071 +oid sha256:27182f6b3d8fb2fecdd81e74092b61763386b70ec2df6a9dc75312680d7bd98a +size 641550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4f6025a6f3..b608446994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:342305d01b6091e237d04b1fd4906825e9b5b0ac060a1199adb78725c4375aa7 -size 567754 +oid sha256:26c8a241ac75feb8aaded36b33d4a9439071409231a3169c41a17238f4de43b1 +size 550965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c6650de383..31058ad1a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9a9abd646f8403a2da380dd8a415b5cd0844e41d5af7f0fe6e9d345c04ad78d -size 659571 +oid sha256:eef5f3eb34722a39638f2bf1eeaedb8530923f016f96dd450e8e392dfad2c0b1 +size 641350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6c28f75aa1..86e21099a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:717388feb7d94faf67a5f9f919b4e6c9b4e7394a4eb928f512c5c359801719e5 -size 571058 +oid sha256:7e1674655d00f1c9daf40d33d7308d5eb3e60bb097b76979d2df6a330065ffbb +size 553479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f501df6d34..c0bd101d98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7198b526ceb7f373a54bb021fd5fc4199998fde9d47031bb2200dc729c09816 -size 729511 +oid sha256:54145c69b62550c34020ff9e509fe1f1dc199f392a9a1eedb9c67e9fa3304845 +size 705616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 400e1eeade..3b0c5f0fcb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28eea0f96173cd63bd6ee65b7b8363002837c5e17e2b6674e91da7fb1a035abb -size 634387 +oid sha256:cd0014449b7536887b0a9061fc918d786b0be8caba58f23acd7a755c08bce1a1 +size 617647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bd2fa93910..a0ae20b07e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebe87a002f81e908431dfb6da8339d4ca68a2d639105c67c16ed9f34aef5834c -size 760117 +oid sha256:59e1e538f74b5bfd249145e8275286c54e152a08186927607f5f9ab9ae602850 +size 729070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4090240db2..d13ec82a74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b0f8e8698272355625ec3eba4bd3d261ba8663d9ba5b1fea601e0f2b10288e4 -size 667757 +oid sha256:d0c8e72ee0d499615e3ef40c5ae733798c0bb236a3f9d1966b283077438f2f1d +size 639670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 86b450c79a..ba0279cf3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa4604c3ed12c57ee12294673830392595b313d7254eb20e65e14e4e04167c0a -size 739889 +oid sha256:52308b27f303194b440d1799032635cfd854ed516b2bbad6f079a7627498ad44 +size 715552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 03dd817faa..5760752225 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca3f516d0db1db8fa141ae38309e4e400494d8dbcc56eef2ad1082b51b667f67 -size 648417 +oid sha256:71a2ef44722ffbf33a9bbc68b3770766d536cedd3cbb22ce794e0c9607f9758e +size 626152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 74b69caa78..808ab8d78c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab9dc351821c3a373aa4cffe779137b92d78c1195e53621bf5590689bc1c3663 -size 762631 +oid 
sha256:c86316a8be2cf9125c20f2fe3ac1a4ff36e9c266ef4005550fd7f01c57ac59bd +size 729660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 41957ed8a0..bee12a74bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1b4ad2a91e1de9025f88b9f9919fa552553930b5637efd3e17825aeb25883d2 -size 671651 +oid sha256:b5847dba288c9cf983a79e87dc7bffec112a2a7adaef33af4d1a3602f3d0e980 +size 643564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f169ed4aa1..724591b453 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22568d7b80f73608bc38f86155e57bed87e87a3a46793899b8b8e6e3c84b85bf -size 743143 +oid sha256:c9bbaa10f2816d3c9237facae48b9bae89d441afb52834a0b66a7741812aa42e +size 716192 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0bf7034c44..a8895194dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c456197738585cc014a04f13f11d336a96081926ee3c3be7aa215af778b96bf6 -size 653103 +oid sha256:82b37ec7ed63daffe6a9986d4d67c6a6872c5f617386f0983de4b77fec03ad31 +size 630836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6942ea0b81..8be600019c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20c220f4cb8846aa1bd132dc41b875e7914a473c95564606c13ed144e1fad433 -size 834593 +oid sha256:92fec75b05027ceeb2065178644c01218ff928887d30ae05f5bf235cc12df319 +size 801226 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 79d7d15896..037bb8c65f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2f5481dfcd9f221369c8cd15f21621018c36986e654d718c4bfc948ff6b4ab1 -size 735079 +oid sha256:8439e5731a7dc0904dadb8c61b5e599f0d7b79dda644ce189115a4bfc4d7399a +size 708324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index dd594f0cc5..446d7087a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:677403b97822c6228df9328792c95d4a31f1b83e7f24ec4e7437240ae46a4df2 -size 815155 +oid sha256:79a6b81ac25e562737f8ac39da6811632c45bbfd066fcc30ee1f5edb165034d3 +size 787758 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c4b41eb950..f13d8e3422 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8643c5effebb17fbab2f30c67803b8b04c9700b75cfb74e4983328e84d7b628 -size 716529 +oid sha256:064e57421d7f0c8842e4331e3a05bd5e07d43eb236d463f66f83e4f7c88d5a06 +size 694856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index efb7cb2707..71528d1d40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09f54b8ae77a102f96fb8eb27e86ca2c199fb8c5b88c88ffc98d8c12d3790c41 -size 639935 +oid sha256:21b1295cd3f08d26bdae3be640b4aaff904dd867752d2f6bc954862482d9ccb4 +size 606173 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index de6d9a3bcd..fda962732b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:470a3b8bda68a921465960cbdc394cac6ed46f75d4018fcbcd00b1acf0951bbf -size 554036 +oid sha256:1b1bdca104e70cb4f785cb7f1160f927c9574085c21f1fe5b8dc4f3197c91e98 +size 523581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ed984a0118..01b83a0fef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e847a72b4cec7203ccb485f4c76794f77d408b75ee79e068b03307ff7825e06a -size 643139 +oid sha256:c7ac23d6584026cf6c5784e7dfa15dd07b626d5020adcd18649b8b1a31b83d7d +size 611845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 3802a760c4..f89cbabf12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87d8a2081c822625ee94e3a26a40e0a71ade527754339ccd3b69ccb677b3f013 -size 559016 +oid sha256:4dee0786a27b1cc79e9445ca37275c01baf3b04a187ce35465d0857d235c8ddf +size 530387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5a76e6d91f..b5250a25a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cccb4fb89a15dffe3156ede2ae36bc915769ae1af4f757afe385ee1aa574294 -size 706221 +oid sha256:7973382664dbf8ec0bb9896d22422f67ecf38d44f2bfb170fd948e9762b3f53a +size 674434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index aaa8c6d1c9..ad180eec61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d65a04c104fd281e126cd9c037758297f21d13f3dc03410910f0c5473ca7987d -size 622297 +oid sha256:29a993fe0f148324450eb8184a2871c3670e48994f828480b16560fd9d2a7398 +size 593123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ae9e94a5b7..c9b86420cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1eeeca97dfbc5707a60fc16929dd496a1d4f758726f729dcf7ffc158f6bc2021 -size 729525 +oid sha256:3fe39ffdd13aebd5dc60342f9845d7bc21bd6e6f1d86161d27d986a7d995e707 +size 692706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index dd63ab8aff..0f0c160d3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8740c3d4a00a632832a48802c2fba8ff9d2c9f4fc2505369b19d989dba99942f -size 639287 +oid sha256:dd5e69f68cca9e1c15fd7b72dca5065f052131dd4bee43f8114fa26ddc443b6b +size 607351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f52831a36b..e9b7baa70f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0da6d49f0fdc2407be8f1f5061b1d61186bcc767b1f4980456ff8eb005cc565 -size 720597 +oid sha256:bc31de2cd472f8bc403073d13b022c3735ae5b55f851a078ff10bd9301ff27a1 +size 685554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
20f5bcced5..c83c9e84e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07a596bc7fc978f3443e35a051ed7361e5ac68ecc1defa5d9c701ee599471ac6 -size 630457 +oid sha256:12b0d5cb4a0687e8a72c99c1db8ed99895c32516d975216cc79df1bfb268013e +size 600197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1bc1727dc6..0bbf0795d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1858843aefaafe3d287b76a0bde1f16f51939863684a509971caa8300353b790 -size 732483 +oid sha256:6cba1eb5d9da9f09daa5c844204fabed62743d28a524293e0be491e104718200 +size 695172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 610b30c41b..1b6f002649 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e55e258b177547e5f8551d8fa7028a7fa286e92537c15dd18aff3bf5ea810a2 -size 645451 +oid sha256:c274c217bdb881ffde95ba28ed1517f51614ccf98c2f4046f39618fad4b99276 +size 611197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 37ff71781b..5523bd6095 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c58022691b58dc27a19120a1b707675ee972d1383166aec029023b636373baf8 -size 722765 +oid sha256:cbcf9ea94b9275cb38734c9bad1133aa2b52f7d360a4ee804f44ccf33fcdfe60 +size 688018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a42a7d5f92..614316c95a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:c2f4cc05a87797116fcb6483a67e88002031c9ee53f23f8f867fd3f4ecc1e425 -size 635043 +oid sha256:1d3c3374215bcdc3d2278adb3fba317e067495f7b86a6ae2f502208f5ae0e770 +size 604043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b44de552d8..888b3bd5c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fa05388a684ba433b8197086dc4ef3ff895ac5ffe1e3f7de060ed6097c27b7a -size 803065 +oid sha256:e8faac51ba3d771cbd7ef7462cd367e2be06bbdf1ef6462ce98b85b23980eecc +size 763286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c0113b7db6..970d81b3e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:e6351a25ef5699eae57d19400ead832ad90d6c6c9e01fd5badbcf5492657d713 -size 713023 +oid sha256:86112ecfde5f56e9106c06ada86c3f6808e63370e8a19efaab402578a7986f7c +size 676450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a2e376611b..0c9b8f82b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e824fce818105e7268c6e944faa160cb73762a0c793d921713d643a7d0a1174 -size 793297 +oid sha256:22fd968b4a3ba2ef577802bacf6450389db2cec1999e692c6c85c93b6d6ad483 +size 756132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a0c1abb56e..cf31469acb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:448c2788b4726aaba61b8422a794a89b1cb95583e14a1f2653c529882be01cc9 -size 703403 +oid sha256:68fdfc4eebc3d653aec79f0f9b1a637145cd95bd1f8b9aebcc8dff347293203b +size 669298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 026c73b251..326910cc2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c2cf5f904dfbc8f09408c26db36ecbee05fe7c11a18433b44a8d8ec4b314b2e -size 656711 +oid sha256:96b83cf39f8969a93ac3ad9407969b6302118dfad70f13a823763b748a4decc3 +size 620928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f69b3181bc..6e90207641 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0b36c3d46257140b21d8408e8aef1cd0ed9fdcd7ef969e76b2d98eb5d94dd99 -size 573378 +oid sha256:6d6b08e7adafc38449398e01956a749d0594a31e60ad7981babf5a66a8ef4f06 +size 
540703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f4be62658a..40413a80f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:810f4a9661d275f5943a7027dc32df7c802bceedca7ec64f40941c2b6efcc054 -size 662087 +oid sha256:d9bf7e46d782c3b16bb7de9c44571e432e22a4a68efd83f915619c53f0f49f35 +size 626600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index e248e2f571..57c4657ba7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f33f254673a5528c0aeeb20effbb3686601888ad4e036bc824d89d85c834b826 -size 577570 +oid sha256:1a2b5a96565f32ecce05e99a67c9f6b7b0319735fe4020457a0095437a23922f +size 547509 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 193ec7eaf7..abf4bcf178 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f87f56c8449624d6b2ad02e8c231792b57afbc1a879b504a022d248db64f6ca7 -size 726797 +oid sha256:681867f5ae3dda492235ec0077ca814bec2b5549b9c3dd6906c1a28291b84795 +size 688350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index c5e1c9bf38..41179fe6cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85546cad81a38087ee7b88f1a7fff6ce25258d714c9d4118f0929438db4c6b53 -size 641639 +oid sha256:e12d48cc6a660f0c81a38c6de0e7cf30c5c38567ecb575af9c2fdc255a6d483a +size 608667 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 61e496d79a..be1740e1e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf5c4403f86387af93f768995e9da8b6b842cf1086abc5a9a37c9147d4a5dd2c -size 752569 +oid sha256:7aa058d7af394c3c3b2f319c4f8f310919091758f1f29fe8bf4354fff8a2f276 +size 707412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3174a08450..43bd0e0e11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e93f32ce255e0468e25298e2cc973ceb935ad6e96e545a7459caf0a664e04dcb -size 658629 +oid sha256:8226d31bd909966eb57c071433166305eec1133326a0dcc99f6db624423bd0ef +size 627682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5f2babc8ea..35ab16dfc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e2bef15048374e9f7aa50c29a9dfaa3494c95a315c6a05d58e2796d46fd6994 -size 742849 +oid sha256:514e0c627f96f15b64dfbcb7700ea352775685cda6494ab3d23ac8a206b59004 +size 700258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1600ea777a..1dbac5f701 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76b203cc83fed13598bc67487857838cc9ca2a8c4fdfd9b8b16ded09fdebf0af -size 649799 +oid sha256:9670325ffb7f02525244a276a8862ac4c0aa213d3ed5da110891056276a647eb +size 620528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 
ec2f2bbe01..38c901ab52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32d2ee209300a85faf1b33f5f949531b5957469d50f540afcb33473de62ce9f4 -size 753947 +oid sha256:d1edb6c26abd1d4aa75d9270f82f5ffda91676a715664db21835e87930df56be +size 710666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 38e2c36796..699878d76d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:337e2dc44db6aff4022beb7f1d3d0f66a96f60251650d9a8f9a683fbe304972b -size 664795 +oid sha256:c9c74183e9a733bc4b02d7924bb42cfd8b0675b5e72f2d2b820552627ee94b9b +size 632120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 915974028a..9988389309 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:921505e65c43697569596e9b04eec681545c5c4b92bfb897c6e07f92c6f4fcd2 -size 744229 +oid sha256:c61e979aa04a3266d222606fae5b56d4d9561f553b3b0084ca3b6d27d26c0c42 +size 703512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 557adcfdee..435c2116d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c08855e161e5e600fdc91381e9ec63d117304213513efa1590e231d70155e3ef -size 653497 +oid sha256:efb4bb85cfa8f1bab17b6e94d76db940fe2248bc5bce7f9d76507970d110c929 +size 624966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 380fa39d3c..3b1b4479db 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bde89e0e095fc31092377701ce2ae77d4ff5d5a6968174dbd80320e064ed58c -size 822209 +oid sha256:4fb4260370c5efafc8ec3044914db87911625e701f8ec1a521508375c7cdddb8 +size 776412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1732c7a9ac..125f6fbffb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2057c942bea26d0721033b47572e855335f435d255c8e1bb40991eae45034449 -size 732415 +oid sha256:e49676bbc1017d90e518a407b69fffdce2c164e74dc91df1858e44425c5761d9 +size 692636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 08b1416d44..f32867e419 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04e7988a389a6345defbf42a89c417d05247d71de1cc28c7da6706647fa18e9c -size 812491 +oid sha256:26d38b5a963b9b14e09bdf25edfd71704f0b45bd5d8fa04930c3634bc5ef7e25 +size 769308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1be6a9bf7d..91dc9a34a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aca3f8117fee6d3acca8e33117bdf6ddc495fb3df5578ee7951b0960841695cd -size 722745 +oid sha256:c0aff6f7985df10da110b67bbe4f31ed9410eca5768e9c071241a93e7951357c +size 685482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5819b4591e..964273e724 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1e76f4af339b1caa79643c25b3f43ab6b5d779012495624f0804d28a7625588 -size 633475 +oid sha256:e61755fe2fa7945e0ddbc80ce998352992f64dba890bb6d5734162349a8311c3 +size 618364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index bbe7c11393..d8828441d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bb23654c9ab3b827c9c6ba5ab9f61bd98c5f13da9b7c6a0e408cd619e78b6b8 -size 548514 +oid sha256:3eacd391a3de633a8b1a82391bbdd6fb8d988d089bf32ec408f8bff9ff9f5f18 +size 530935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8b64bbdfd3..0b8cc3936e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75328d16f046f780d44ac1301cef4ba03a77b0eba280b8dd98394e62c11ea21a -size 637075 +oid sha256:780807da6d7684eb8f58db92a0a57d115230fafeaf1bce81f9dd3b6c48d66d99 +size 618164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6c30e5e4d2..34b0110311 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47936b621ac25c3f55cfaed18b4c2e5ee7ca95bf5be1285b294069e3f6b18dd1 -size 551720 +oid sha256:6f25fe742de9ad9c8435f59eb1e958b9849b0340f17ec192b89a1820e93d9447 +size 534189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 785837c0f3..74d4e8e726 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:997c61d48e63cf118ad5e7301e11dc2ca7611b38d5d5fc9416a1b7fa19f920e1 -size 707015 +oid sha256:df5092a1f61b54961247e1d5e0e1c9fcc4a4fda29f39de695a5abdfb8ee52fb4 +size 683220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index eb38df1a62..a11c931797 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0d63f020b7e9bc6d82c80479183ae68e516e4aebe6ff759074fe0bd6ea77f3e -size 614406 +oid sha256:f994aee09586de6cb4fa665eab33fa0a844cfa3c885add171bf3e9359b1ea137 +size 597617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5a108671b2..89febba2be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d2d1862e557ec4037df9664eb1365e87bc626dfc8e6054312cc90e2a60c2751 -size 732095 +oid sha256:f373540e74f46079a47e8aefedbce0e8d527787b82dfb40e77b61e843e9ef1cd +size 706622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 80ce6102d8..15a98bbf3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b466d835380260009bc081423239b587154c7cc81d91cc6d6c4956c5fc9f1190 -size 647777 +oid sha256:19c101feaad5417a8f2f357ed940dca991e3218290b70b120d6287a9ac0fc9cf +size 620478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9b0a4db6aa..d665891a61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6570bba626b1ff6061a5f39476fcdd973596b07795d411afac831f160047e294 -size 712657 +oid sha256:6ceb7d58307f4f2810c7775d94396992cc812f6c41a91da541f6e9889ece8d4a +size 693154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 77345251dd..42f4c03387 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41331d3c3aca30f345c20ccde039e3f42a21d02b579458c226a4cc2b505761de -size 629177 +oid sha256:febfb5aca229daee83e6cb032ce2e16b39a03fc6a5d6405609eff25ea3fd30f3 +size 606961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e987516a92..3fba389a40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6f42bba05b90d9cc09f49a98ba5d2ce72d65e343cdf00ec12e905697d6c152c -size 739987 +oid 
sha256:da5eb58926cf71740fb2567200f5475e92c88e1d37974e50a149c4b5dd941ad3 +size 707262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d775954a3b..6426acf1e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aca85a24a81019a625ad790290a5b5bdaac2d2ca4d099fe1c2678af1fc50288a -size 652363 +oid sha256:c604c975e5f3ec2e6e45f5676d2d8d555035672ec1b0872396592e133cf044d1 +size 624374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5f89f2f1d2..55545108d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c342c234ee2ecb5958b867972ec9bab3b97ebe9949be9aede159235650d62e7 -size 720549 +oid sha256:3c93afde5564a44a898cda6610c903666dcb1e511cf035045941f8e666a44e5c +size 693744 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 15f3accf5a..2a86310e7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:287ed54b9a2af69e18833d4a76389ca0c48aa6a99e167d5a150a9f821d1eaf09 -size 632975 +oid sha256:be62558540826881961cfc0457f911e4c744254097e76d4aa5c0aa2676067165 +size 610905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b1b638f7a8..a797802b89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f54576c2241d8c10ef462f1650db88df98d844c41d7eaf54f325115b3e3c4c0 -size 811997 +oid sha256:b98c652f6397c93c637e69af757baf9343542f6254fb123accd31528607b9871 +size 778830 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e864c79fae..ef3508097d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c88d4daa522b512e30c42a3578f39ca081054e5bccd3c6bbb0f16d4bba30ca3 -size 715889 +oid sha256:e7256af5089e08616e9c0ed2533b7395e946f5d0d6131f92644b5a5c80436984 +size 688394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 18be532ef1..cadf125cc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8ef25022e902a3777db6242a85a6f1f0a510894893b78b866557158559d912e -size 792511 +oid sha256:c75bef5c7427dd4480db29f97cf279870692269b509cf7616eacd7f36ef24772 +size 765312 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 15f919e34c..5895e7a767 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0486c59bfaabcc358599195cb0856de6fc93797c5e9433c24414f14e98ec9154 -size 697289 +oid sha256:e754adec100f33207390fe6fa15917eac9f92bf288d19268178f7ca1f2b37a1e +size 674876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 497baf721f..f5b7d99d19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2678fcd2a795035a46d3b755e4507a5fbd22e0b26d50dad7eccdd0bbfc5b15c1 -size 645459 +oid sha256:dfffd5fd7d23713db8f52eacd739cab362b6cff9540f37f1f09a7c476510f176 +size 615151 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e0ed25ab94..1142d39b5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7878921e11844c8cd399aae415d8eed7b4d0e9b63d3a11335f09470e9373d54 -size 559610 +oid sha256:1ebb5d608bf8b835f6947186e60363ce5b57ed36a43d6f90c7d170280039092c +size 532511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 899a3ac091..ea56e4ae4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:446cd09643e3999485e26998ac13d147f960bde086c867be4b666375dec20393 -size 648713 +oid sha256:595267777ca0b44af62b0f7174e4f8ed1db9110a30edc7a3856a1521ebc6ecdc +size 620774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 76a5914880..132c7a637c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b29c2342cf14fbfc8be4b8ca9dc868169a4143f2aab4356bc5940c22b5ae90dc -size 563802 +oid sha256:4f7791a8897fa7d87d0a2af83193ca7e045e012f58a8ab732e47a7646f0f728a +size 539365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b93cc5561d..cd87de63e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:734fe68ae3d658c139b19912738d5a8f6445c748d9239f85703167ccce274751 -size 711795 +oid sha256:84b0598f07f282af25ed0e535e4b64eae6404ff10897aff929ff4f6d2d487f6c +size 683364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 97d51076d6..e30fbb3d5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04136587327bcc1f8f0bc567fc816d523f9b52b914fa5c6c0c0d85f0d5efc16e -size 627083 +oid sha256:512dddef22aeb7c239c381fdf8e72eef17db15afb1780469b4b43ee25efd8cc0 +size 601313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9649db7905..a99174db54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:891c98f77931a3049b76450d8734bcfeefeb88322cc9c5d1dd08e2aa5e043f85 -size 735101 +oid sha256:3b00463036ee511687b977ca3e4b4467fd7feac47a65d66d73b83b4f14641a8f +size 701636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f1264a3244..d117a4e1d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43d34a90d6d590e045aa28bbb383efb1dbdf21f8cda040a72faa3d500035076f -size 644861 +oid sha256:6ad0b39716ca4ed2732df4c786cc12ebcc73dcb7aa0a4953033c41a142ceefe0 +size 615491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b602fec9bc..5a40b3c020 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e3eb3da576fefcf427dc01619c88ab4420d74e743e059512a3c3547f69a61da -size 725381 +oid sha256:bfedb5bb6770119d5cc63780c2e0c1d115881369bbbe513edf085d163a38ac0c +size 694482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
92a838c72b..2940349f0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:831386368119d2206a1f7eb165cdeb2126979fc682bc0f511165f7c28331db87 -size 635241 +oid sha256:dc5c2d49c81f3c586e9e00c37f36fd5a0ada767d1c8ee02f0b0ad50a0e72f6b1 +size 608337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 917eb231ad..f6ecc37d6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93ac6a8be4751f2e943582b20fa9565042f005087901364d4682a5a26ee8dce7 -size 738059 +oid sha256:995180f729789555c549a1d8d8413d3147acc0a5c35d7dfcc4647a63020efbc8 +size 704100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 53ada6cd42..84950db181 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2b14d15c1e482c81f59fc55565ab2c011a75c1119d2652634ba027651724f2a -size 649299 +oid sha256:8892136b056116d7e69ee43f9d37578b83f145152e6a7ac9e796d9798dff5dcf +size 619388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3f6a7d310b..f2771a8713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f6bd9d1464e821dc335cba2176f12a846faae3d627dc24d58a429e8e145f9fd -size 728339 +oid sha256:23531c50c986636ddd0f4420f155818a1bf44eb2326811161a9bd337889e376d +size 696948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1741aa7440..843f0a94c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:6ef0052e6e9114c66ed2702294a6d808f3698f2294a81f4763f4395f01a54af7 -size 638743 +oid sha256:7a279f94b71f9cd4c83b42d987884a25e1f4c281747750e0546fe7c753f2e1b7 +size 612233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index db0f9381e0..d7a4da7e1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5456f0aadcd1532a3454adc04fe1219725d4bab1a27e1cc111c8b6d7eccdc82 -size 807849 +oid sha256:90a432b4c3217dc70e862eeff6a4931a7eace6b6e2c548f8a710bea6c5b28f58 +size 771426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e8e12bcb3f..66d0569133 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:69788206a57adf87c10f060cded38202a067d8236784f46b6c663e751b0f7872 -size 716871 +oid sha256:4897c1e6ca3e7bbc40eb03c926ee910f6680ec59c5d710a156c1a8dcd7e64234 +size 686022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3a6dc6f852..0fb416291d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecea6bb5fc5dfb27549b0f4ed095ae494e17bfc9cc90870160fab44edb98a486 -size 798871 +oid sha256:dcd33f3b213ac1394179e1f8ea9d18acce2bfa2e9011d7fdb5560c4574561d30 +size 764272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c641271aa2..628a8ca258 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:13714364b70be2c42e8b9583e1e1fb303783633269ec8fc9782eb9de5652fae2 -size 708041 +oid sha256:3594a0ed7f8399919b7c4c9a96522782843627b35556d9807f48cccd3ff8d6ea +size 678818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3fb57c92ee..b7a633929b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af88438cb9ebd76cf0185664e81c2702ce9b04f89609531b624b56abdb23de5a -size 716455 +oid sha256:ed53bc8330d5a513ec456d3edc2480b995c20a85ca30d03666782b2132b6eee0 +size 686542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5864faeadf..dedf7b6c84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50771098d07856466002b800ff7562e0e9832802a8d9c32cb66ecdb00164f3a6 -size 631543 +oid sha256:95976d83fa686a61b7e0a3ebd0e5582a6cd2338c76167de834866443d1f6ce24 +size 
600347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 42bd205cae..63f7c64601 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:638e834b4efd6d0443b447c153a93e49554c8cd2ee7b5cf13b1afda5a0d6edf5 -size 719807 +oid sha256:21b8d2c1767e4fcd39bb46699f401212dfb2c99f21398c0a61ec23162a9d8de6 +size 690782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 95ca7f7c92..7673bc9d41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e878a841123ac4e2eeb7861748deffd82a36497fed1da91a78bed475148895d4 -size 635735 +oid sha256:b01df4a7ce398c7036cbb4cbc1f75eaf5f5d32e6c17c147243567bdb7908aa9d +size 606019 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 02dabde1df..ce366a5d2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9cdfc7286ee8256dd3d35a974d30fa81ca17ce558ba8e854a045f859335488a -size 782889 +oid sha256:51746feaca4f0391d42514fd042db1519817c3db58a376608e5ec3f01e7eaf17 +size 753568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 376fd90472..77f1e6924e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c195d7096b57353d5a261744b38df5a54231ac9093b87d07b34a73a582b9bfb1 -size 699803 +oid sha256:031270190f4bf30a1c7c2c88c83ce4d963a27943c649ba96334b36d5fe09269b +size 669052 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 54fba0a329..0970ba4313 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37837008649d8e130ec943407b3053e4f2a9cbd3d3404b2190a6070c826d6245 -size 826963 +oid sha256:73e9df4c992b9b26e3242d2e827d2716b64f650e7978fa90f730ac313209bd27 +size 786790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 74f2b4dcc0..b08a52cd63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a59a42f52cb90b60bb70b1c7fa6a1233435f3101e01f63016ac16901ba87b807 -size 735689 +oid sha256:d321038affb22b36b1683d5a52143922da7de175f28f7800b599a90b6d395389 +size 694330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f9bf22a3b1..3300d71135 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6aa99922c4deb8392477471cb42a22f33f1d3392dd302e6f8bdb2f2aa68c6275 -size 802247 +oid sha256:44e5c1428a4dac8edcade24be7271bcd3d307b8a9d560a6d3ab03a791f68591b +size 771200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index fbb1739f3d..d91472e148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d3963e738b883a8bb7920405f9311ae46da9c749ff9657b7d0bb6291cb3daf -size 712057 +oid sha256:dc5bfc53794bdd3126e5073d7b14cc9765c8c2cdfb6b3190ede4f2c3b23d3d34 +size 676866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 
30c5fe80a2..2c5eb9500e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36d98b9ef9a40866a067270fe609bcf2d50715d7e8e5b83ce74f1f5f2e587b29 -size 829131 +oid sha256:cab2d4c6a00c12bf7ddfd8ba8968f9b0fee08f33f3504e452632596a7db18f04 +size 789748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3e6b76581f..782356bcbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb40688be427e5970ff386f5688b011e025041187edc468c08aac9eab790b7a0 -size 740767 +oid sha256:074dd2f52169339372cd68d68675a8e4f91b08b72fc0a08d7d76daa3c499753a +size 701186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 043e8726c6..8df7021f71 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ff3b0aa09585ed177385b95a321549d2ae7aa0b8b7c1747d52ee1cee8f5cfb7 -size 803725 +oid sha256:2c0aaa48373c833f12bbfea80b16fa75fdb7bafd7f4f06d18d19c16969d0e1c3 +size 772184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a977eddf3d..2159a855ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a999e2a7347b7e33c80114729d2347e3530050f31cf31fedc4393d258f19d7de -size 715559 +oid sha256:a5b1a91b5fb0097594339ef71e62810ea7dd8f1d96e0754d8343d366f7b44e6c +size 683722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 402523a17f..0b01824cc7 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b011cb644da7ecccc7fcd8acaff7efef0d57e97a3e17ad03d17abd804f5b38f1 -size 896407 +oid sha256:81c16c2162adc58f2cabeabd84e91fc8732d9213bd845c4a42cab958a3ad28b0 +size 858206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0ae4e78167..2214d9f07e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae480ecadae3d4002b48066b8ac783fb08b57d3aa764f1d149190b77eea4ae9d -size 805281 +oid sha256:9c7661d96dc763b8114dceee31ae8b17a15bd56c229dc468dfe20c9fd1c5af4a +size 767376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 981d875423..b0501ae907 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d82ffe05bebea8a2fd3e19f4723c918572bad42f033f25bc7df7db2fa0724fda -size 871789 +oid sha256:13b3b9a1297d9448ac2155d79043a4cdb481f52f359d693afdb4fc60100ab936 +size 841926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ae82e463f0..580534d912 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acb61f85779375b9eab457705961b93174d93e3a248177a60f6f54a0135f68df -size 781649 +oid sha256:c5595a801495542c0d7df7437e273f4cf081c6b82925cd9fd97e24bbf6532dd6 +size 750800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bb0d26e04f..3cbe82f35f 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0978a67bb4cd9f7d37650da90b8fec23316bd3e94a503e985035e375a993ce13 -size 668699 +oid sha256:6f96ca92b21e8429ebfa2962ff4d279569d4021c790388b3562a56adf67f1e99 +size 645348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a5c5ac5c94..f0d38fffa7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85bddb9a92ed308c6694b57e6e360235faf694d106c7580b691f82a28b1014d9 -size 565338 +oid sha256:0c49a4cede071ae228c608567f358d9ccd340d5f244ee0075f1ce1e8d95842f6 +size 548943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a8590004cc..0f4f333fa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf2b75a0af788da776b1339edc7d6c48c6c4617db1fba9ed53e02a077d25b516 -size 666873 +oid sha256:2411b52af091eb3e6c1f40ec66028951e4978e9c3718a1f26c3d4e339ea41da6 +size 644558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 6251908a28..4c9966e398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f418532bdcb77ecf3d0868a95292d17c6a5c077ce0802555f9b204f4f8b38b4 -size 584772 +oid sha256:ed67cd949f4d81cc72b01ad77bb48bd163606c612cabc06a139a2890fa1b8039 +size 563789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2d1628df6c..1653645bd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd165eabb8f8baf6e91b5bb029b72107845d320a9c2aefe5d62a18e0f6115ae9 -size 734443 +oid sha256:73bc3f4319d72197f9345b4b4672e4bfb1391b597ee75d337ad36daf27622d35 +size 714448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6286bb95a0..7fc37d158f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dae0d644e062796f5770bb5796f72c942ae7f8c9723613a0754530e3b43edad -size 631971 +oid sha256:ead111dd183a93f1686ade3200f9c234b04c10200c36cdc087baa869d70368f8 +size 617251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9116e2248c..c842e5d7c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5675e5cfcecfaf6fe6291f9b238c430db9ee4642d9b69b67c97ad831e2294943 -size 812607 +oid sha256:9b01879d034e604304100e586efd50b510e1689c03276594ab73a7840ac91b21 +size 762468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 293c6c15fa..b8c237deb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9290ededa6b475f53e5afd9935f19a65dbdb400e7369116cbeb0ac04b749216c -size 691437 +oid sha256:d984eecadcbeb956c568c97c989972e2e006fc905dab19804c33e2da4e51c94b +size 657528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8312846bf7..82bcaac6ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef9424811f8be84db2989fd5f5c04cdfadd7f450e45242c8ecb0b411f0cce92f -size 758341 +oid sha256:11897cdee3294dfa405ddabb696d32e24adfaf26ce00f5b406ce5e3e1ea36c0b +size 734694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3ad59629d8..5902d3773d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8eed19275da41266b4eff5bee241371485ac22befa38fdfe7a828725f96d541 -size 653893 +oid sha256:be33f9294d65cbfe284c5ce114b20edd19bf9b706ea9113473860e1f69288e60 +size 630542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 907d3ecfc1..78430839ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d03bf4e0e738b6a92473440007cbfc7c878d266a59e4cf6719f18cbab1c92e4 -size 814627 +oid 
sha256:154b95ace98837e412592205f81a046b2611f0c5782bd3efbdf87de0d7b9f7b3 +size 756596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 67cae9b00a..f20e703a12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29acc086cecf0e08ee7af2d61c8589d0103c59e3a1d5aa2a8cbdb39de1dff185 -size 720097 +oid sha256:caf5185da42c6f563847e54157b2a11b3a3dc74f99f79fff896c2c69e03613bf +size 674596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5f78de7a4b..8949cccb33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6881d879e258576cceb48839556c9d15cdad8d0d75caf85650294eb2f956e2ec -size 759523 +oid sha256:173dda06b3c1fe7e3d0c7d7508358abd00f2e88ba1883b5b32f290d1d976373d +size 728820 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0fc7cc1daa..1f4e296605 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9cf5c7db1a1ae302eefb940ddfd643d9010b30898404f55636cb83d1918fb0a -size 675697 +oid sha256:21cee5c5225d10c00a59b78d6cf5e7dd09b4426b4580ce5809525365c6ccd20f +size 646820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 051b75b9bc..b062f4ea23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e963ccc625248424d3e8ec53b8e5a33c9ca179c9ea1cc29463dd4ea7bebd7f7b -size 883185 +oid sha256:37b02350df03a35d3dbee85714a9f22da813545b32819e888a18062de93b116f +size 827670 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3f134cd0e5..c763d831d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e4d9f06c2d023725789f9ece0e2d04febd88dc5ec7c7f5d1b0a39fe705ca695 -size 759351 +oid sha256:d417519a14f8b572374cd709b4d1b85c5dbe3df7f9a1d14e412e3cb3833338f0 +size 724950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index da25760727..3524c79002 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:104179fbfce12f3e7ef1dde767aef2cf550fcc92bb98bc2ff92b07c5a94e1f97 -size 829955 +oid sha256:60e783f3441c54361c0495461f85a1a9dc4b840856ecc87d6d0ecd26aebadf4f +size 799894 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 626eb89983..64de2681d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b95fe40ad2f6502dc02626c67d15ecfe3826aae645b9efd411ff75a8717b2041 -size 721857 +oid sha256:eee20a2bd8af1bb3dffc39d171e976c1f0c4874def6ccf4d360268d74b5f043f +size 697964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 12ebed830a..6bda34dd9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e34ff935ee2c8d263244a53688926d98a1f350a6cb9df2390b2b8cf34763287 -size 677823 +oid sha256:bf8f0e341a8e7decef3309c9aed9bcfb27d367830558a34b6dc8457c6de3a368 +size 648156 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 09a001c9b6..5fbe62485d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:459ae585cfbcbc22da3caa50565ea7d5e863433be5b424fa443efd1d59c0f806 -size 588766 +oid sha256:6dd21603dbe1c2812715dbdc7c4f68e981cf873243d382ff21b59b082dcaf411 +size 562949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8ae3c71832..eb38973214 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40029c144ae6c2e0bc531e678b553bbc0bbe07d36999be46f929053655fdd002 -size 679745 +oid sha256:a6cdd3a35da676a732c715dfa1a777750fd865cd201e49f2ca28e4ace438c170 +size 648154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index abd95c6c7c..9dbccb428c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dfcf5f82347105d57a19dd1e0c5fca8d700b8f173885d25a3c22d6f71b2862f -size 593796 +oid sha256:6e4e05297e6a7db0352fffd25d06b43887ce6c32d6addce0307d559a28dfc8aa +size 569557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7e3403b7f9..c790c01864 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c201887190c0c4e080eb9a78a3086c889eee59e01592fd6d353231919e6e5274 -size 744997 +oid sha256:c52f7dbea6546b5c66e363f7a5571caec584975cb4eab60e3d022ce06f070afa +size 715184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7c9120c1b2..a6db2a49eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e12020f48a69fd1836368fc7c9fed32906952068f528e208aa3de409ef02ff4a -size 657521 +oid sha256:3fd8e098b76e275ddef826f7420bd0622c3422188f4f5b80f8f323a8baf110ee +size 631900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ff4d9cc0a5..04225912a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b928201836198954184d583af4ff82a2f5094947f5a46fc7de684c66953027d -size 769437 +oid sha256:04b4b4930d312b3b565f6cc73d327dd4b3b48784b1e31ecfa363a41531e35574 +size 735972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8b9b857a0e..0c2dd88ef9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9779f2e3abd4d936b8c2d4884f989a20884c958861436df07eacd0364f0b7015 -size 677521 +oid sha256:17a6c0cfb7c742d88b6c33947bede79b2a463c06f8bc28d14f1f969b91231a02 +size 642922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d97758c704..5454e1b284 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c5de3213fbade54fc17629ea728f54b4f00df5c9e19ec5b919c9b90e349b7ce -size 759717 +oid sha256:f96c1720024b0b55d329ae66e1f0adcbe3e79a0c13ecae3e57da6abd4a5aac72 +size 728818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 
b2ee8d2d03..a14b1da639 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9841ff194b85ad6129e6309381ec2da7c03d812b31683b7ad05d05344cb3cd09 -size 668641 +oid sha256:66c7ff8f0040db220463778c28bfe866a310b8411fa49be4206996b826e859c2 +size 635818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 97bfd58224..28416a0709 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b98911adfdc00095963f28286837c4d8220f67f2c3c3b4428c839eb9d5aeb01c -size 769779 +oid sha256:01f18ffec48bd90b6818e57ae26ec7ebffdba8d03ec590c5a1e2870216ff54e5 +size 731776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 434e9a8474..bc71f17867 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d8727377494104e23da1099d1e57bf03dbde94ad4302ea346ad23fad5b3da6 -size 681711 +oid sha256:d41e4fc27d22b3806df976f92c7ccbccc17561208207a38d2ca9f161da263482 +size 650664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b0c92966c3..4347930a02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d186c1283fc78f4085a0764a7a02e44f9b23e5d71cd6086ac33571333949d192 -size 760703 +oid sha256:be0ee5ae31580cd106ae40e8d2a7575ce9e9b817dc9431bc5ada869b8003e450 +size 724722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4f166c6ed6..34dda962ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:8fdec581e9fcbe8ce80fa1d3d2a5bf27de5e5e21ebcce048cff0f191629b793f -size 672091 +oid sha256:4cc2fd37c5941aecf18473526a81bdc3917504e48769cca611a2198d3add3aad +size 643512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0baee47e53..acf156cc97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:229e0c3b2844279b777aed86ca3b37cdd94167f62a64b72001ed306570ef43d2 -size 838485 +oid sha256:047f86d011da027d9f94e618c304046f94e57d0f3435619d161014879eba09b6 +size 804922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 50002eda28..f29d72f6ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:13a5adbffe05aa710cb8b5be0bf38276fade632a8c0be981bf3d9077fed81dc0 -size 743067 +oid sha256:64e0d177ef2b50ce17b7c38276f1999a7180414a406423b6b55432ee725ae01c +size 715078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 80765405c5..0a340ff9c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:720957d2123dd49febcf6851965497e034df39ad8676b57a15c630625cd78a71 -size 828717 +oid sha256:5cdef6dc99bfbd774c9483360da280d8c04744f3e926e076b789459fec995400 +size 797770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 66e37f6715..29346b0b74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:dd1fba515b827528a8abca5730e85a9386bad198a6bfe7ea6ea39c95d71f8e75 -size 734187 +oid sha256:d521b90b65be19f35c2617d9bdb5e2176c8125536d30ac91cebc784b2cc8efb9 +size 707926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index e47ea6c668..d3c9ed5278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -19,12 +19,3907 @@ #include "../kernelParams.h" #include "tensorrt_llm/common/config.h" -#include - TRTLLM_NAMESPACE_BEGIN namespace kernels { +// clang-format off + +#define TLLM_GEN_VERSION "3df3fb2c-dirty" +#ifndef EXCLUDE_SM_100 +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern 
unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; 
+extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; 
+extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned 
char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
+extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern 
unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned 
char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned 
char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern 
unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned 
char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned 
char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +#endif // EXCLUDE_SM_100 + +#ifndef EXCLUDE_SM_100 +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned 
int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned 
int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; 
+extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned 
int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; 
+extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; 
+extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; 
+extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +#endif // EXCLUDE_SM_100 + struct TllmGenFmhaKernelMetaInfo { @@ -39,9 +3934,9 @@ struct TllmGenFmhaKernelMetaInfo int mHeadDimQk; int mHeadDimV; int mSM; - unsigned char const* mCubin; + const unsigned char* mCubin; unsigned int mCubinSize; - char const* mFuncName; + const char* mFuncName; int mSharedMemBytes; int mThreadsPerCTA; int mQkvLayout; @@ -56,11 +3951,1957 @@ struct TllmGenFmhaKernelMetaInfo bool m2CtaMma; bool mSparseMla; bool mSkipsSoftmaxWhenPossible; - char const* sha256; + const char* sha256; }; -extern TllmGenFmhaKernelMetaInfo const sTllmGenFmhaKernelMetaInfos[]; -extern size_t const sTllmGenFmhaKernelMetaInfosSize; +static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { +#ifndef EXCLUDE_SM_100 +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ace47ecb2ca9a17a779bb5a7a3dbff260fc38a7b407b0b206de843b086e7f5fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e584070eda6975bc94b0a626cf309e0facb79483607d13efe22bc7983094d578"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dad06e31dc4d4631d9d6e38231329372a3ea824a0efac1d19bd11167f9195629"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3f68b2fcea8e97c1b834aa162ac1d64cf0302f4ad93f14983feed886cb0bd495"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ea6922f89d7807e0a620c77e5529db4a8eb83cdba3e7d55d2f99c2f0ce86ee26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7c1a57e824c5d01340aa32dedf85ba19176a69bfba57db14c46849d5a46d4128"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3836eb0297c4d56a706af188148dd08ab453f833cb347f9b42790462da679614"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cfd75562618f8cd8fc72f8aedbbf370c76a99cb650afaf544917b152e91884d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"3123e30df4c37b699bb9f4b91a636c1a1b8d8441bfc12a5898650e72684c4a2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "2286b7caad97980fa0d927105c2d61c26953bc9a2a9008f6c0999d69c0c79752"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0eb289d62998d5b44e4afc4f89d346b64f320ebfdab4a2ee0564ef0b0e5d81a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "d39c9d0fba281078a8e3b4e812d9a02024274dbb10de03c81bfd0bf55b57d405"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 
0, 2, 8, 1, 0, true, false, false, false, false, "68c664fa4e7f16b6e01430ad191adc8efb3d659149f00a4c0c4474a6d3681f96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c7c90dcc0a13cab6d4d0c268ade92f7a93a24764c1459edce0986f7e1647cf1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dc58d71423e61e2f358f2cb00b29165949ea0249497a05953859d5be2dc742e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3afb3c7b1b90a3714a65c867028212431444c9e57913f8246aac5efcd6473f1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "57f26e79140a4b745a02dbfa35547f901559f5822960a2a35b2ceca1865636f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "57735a1da540b1be8c3123ecf72ece164670fc86d33591155c8bd6b223c4735b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "95e340b05dd9fdb85bf49d302473ae0fa164e227562c78e82b4b7e5466a4c271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8d4f52432b7edf7547caf667c508993e7a10eff60ab1c4677ad8807f15436612"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1826eac23905d85b1215cee4916b8fe80f63ecf296c1452038202f000b0c0753"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3c454cdb5ea1da2a81274c5776a83c58fac097e8df47b0c70e5bd5aafa165e64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3105b04e19ec2cf802b2f9ac005f0e0ee5dad34c385e40cf9bbc3ea2e998d890"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2182f5454cba43c9f59272cb76aaad7960422b182bbe875c1eb4008b56ee39ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "f50065a04bb586ab951e900177b2a12b5bae2dd44b3c4488e31c1ee1afad8b01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "5345ef8aa5851ede84141bc59619ff45ee0cc2a148ad2a5a632cd25f5d9eab30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c94031c400acb07c986e640d46251fa2ad201ec8f779a1a48b97a0973f610019"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "60d8b511fe6c3ec9d743da4461dc18bd4af4daa2a30b2c424680444a0a316e1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b2e7831da3acccaee0d8e5e742fe683b2ab0ff5a6beab9e52965ecd5c998ab62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "014773498319250f0c3ddffb5a46571798d3617b005f988de2711715cac9cf8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"e5c11447f1c1ad0bed331726ba74429ac82f6a1739d6162d8eb7ef5470791b11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7b19ebfde8ae2402af7b192c551eb2e96eff192dcaa24aeb50b357cf31360911"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "ac9ec794b2c72e1b9658bec89c5d90f6afe7e973c3408fead503eff7fecf78ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "959f88c63c1fefa3f39169b4bc6197af27aea71d562fc66f7af1c373b19da187"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "20de22b2529a35dc3368754eab120adfcf9c935e9c86673f7adfecbda647c8ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "921c759217ee4e341865aa17ecaff1f4a57e8e5b88ea9cf0bcb0fba52fcb8b91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "dec7b2a4b1aa538d3d7289adf175e2828654a1f5650839c69fd212ae78257450"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "27d2da0b6f2fcaf074d88b07caf3debaf354f8d66a52928f32d01db904bb545d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "878f7a07d115f249b6f1e95cf38788a0c398d3198c11f0233520e8a8f45584fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "fdcda529ab2d02c4cf8869d9271effa1c934a2f1a181a4649b8a55aae6c1a25f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ca16b185a6a8f9a871e46eb45fc9eaca78814259aa93736c4c161c84837a15c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 
16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "590abac81560c16b5907c163d81e1634fc329f15f7e7d30c69c84e0ae22cc692"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "984188df005718917b6b5bbf484247217c36a4c3302bd5537c881241c44c1d4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5c898892a211f0747689210b0d081f4b4d4bd4e1efd9276c1150a46a71808fa5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fafc783982a6748e637cb21cb35693b8ca771577c8db4c390b848b6c05968543"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "acc287911b8931787e2f5c331c0c6fa9c01332ae9d56bf5f3912dfbd2268dcc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3857bf4e06457f277d5755883e0c7642f92a12df50330fc363b9e8a435ba3925"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "302b85617cd9c8e7fe348ecb74bb249c6773d7a0db688dd52257fc788cef183e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "cfdf5234683f6ac381a4c568cb3ebaf1403da3629df1ed68dcdec5c663e1cda2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bd25374541507897e2d6e29241f8a93820c750a68da5cb899673fcee7cb4c9a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "bdf78ca012f713ee894300f38b71c600daeec47d661bcca0e6a7383270fcebdc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "59881ff79218ee9287ce54225d9588b78cf741d5d1d7b3a8a7db9946b58d8cce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a059534af43af7e68584f0f6ae3b9610420394bde68bda6d0e0215b012f063a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a9aed607441f1d12cf986202ffb30796d5224f4909bf4e852aa7dc24d972dcca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0be75050c9ee7ebe7bbdf7ffe778c369c6ca3c860ac8cefb1cff9b3d9d09edc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "63bfb6fbaf969dc51e63fdec8a969fcfa85471f6bd945d44c16ec2896dec72cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 
8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "2e7a2c789e9b6046db46c2e6856ae075c7b87c47f8fa65ae36e71770ae43500c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "43fb5a24671fb5cc1549372495edb141e783af9e6b0a9712e9ff18d7cf36d23b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6320604dfece079050676bab0faf5e0407d479867626f3d910bc7acac8b29eb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, 
false, "19e392e0f5df823ff14dc11dda9cd4109da4abdaa4852763d6150e2c0213f342"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8b4b0e42d50d8c6f7924fcb9587b4910f2189a4c2dccfae41423348f3e48a4e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "48ab49805afe2a1bbfe3943e073cdc779045ef2cc18803cde1789be547f24dc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "39f5692e719f6033d34c3e0231c7bd23e8ea8df46c6f6c9a290bfe18539bfe2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f8c3ac432cf4cda3404b4b8ec17051030a9d6c3eef205594412e1530981ff481"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "11681daf3463d238d6a2738c9615dc3f67b005cdda343593d0924bcf764731b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5120098868631d907dfc2dc3fa861ffba14cc239822ac340c140790b668112e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "68a5f0b8b2446545038f9be49ec05c2b6f4524838bc1203494964c08000582b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "39ac28228130fe82a0da6f1f10c3af964c5e49ba57dcea4d5e3f27bfee5c4447"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "edf3368be7f008740cdb654e35c15f0e0e30f97e832434728c48ca8cd20504b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "10d1d2eeb7a74dcad1355cd70e5d4d4bc35afcfaa6c0606de286cca55ca9d71e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 
2, 2, 16, 0, 1, true, false, false, false, false, "4bcf5d94d6cfb7eef0b821e458e7888b17d576dea1bf46dfe5f6824fdce10561"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "84cdb7e30f833ccc5ac341933abe55512ff0632bcac3f27ff9312ba4de735f04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3379d2d269e8a4c04b53d12bdbe953b9c915820765a0419b0d5401f329473b33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "179244ff78b1f094588110d8bc18d45a9321d52c81dc1071ccbbb34519eaa59c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e08e481168c61c85eac3cbecf0fee0ecb56291355b8a01abee3c068a09e500d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "33d3e14d101a5b6dfd13797f8f137bacbb77d0d6e3a924d0349ba8ca76dfb5e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1d9e25d37dd1bac7ebf947d90e7bea64070f76d89aeb8c76719a99eb4b9c046f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "76959a255e7399924186a32d5b034940f8680c30de22b662bce513c96763957b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fed40f31999e991c9fdabf65fbeb9e2a6aa0ab81ac11c9e1f09b87962c611fa7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "75196954ccc90746c3886fd0dfd6a5d37f1d2d470786505dc937d0c1420da19f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, 
"68469637ec709ae338156f0e641153f5c7909866ee36cb99369bc3ec45e2e6c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0916b1f8bfd2522f35d03c612c39eb6c299e333bf062f550fd3a2bbe6b9644c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "73f9f6e8f161d145dc00356a2e28306f716efcaeca01aa515229ff80a3e8893e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "bfe6eb3fcddafac013b01e14482b698564b70deeffdb2aef27cd3af34f296d08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "c8e4cfa3ebfd14fcf0dc4e4ccef9f73a715376af54cffef9aa3f2f5a7ec5f4c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5a0458f738d84df7a2e743b8d4a9d06bb08e5bf8cf24d5eb59500127a91f9e5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "0dd138e0269531766025284a91e6121be469473e1e46c2802cf23b0d72219faf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"8131c156ec5883bce5301f3f37cebeb723954b5b974ae0c5de9fcabd06c72da6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "75288de877a64ebe157c8381cefa522bf4b061ad925f665b31cea9b0220c277a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "73ca3aaa55fdf2e60a6a50c83a95b94b7b0ce944fe390453eb736f211a1e3013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d3185f6d35edde458543b0767c51ac4295d88bcef297d1e6bb6d110af757f374"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"3a6b66f43232f8f9375e18f846858faad6a8bde3eff9326af2082738cc2f12ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b9338fd5817975fe6d9110467119d4381c71b3e59b9520e00c3b531b4e52efea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "03630bb5b3446fc4ddfd5942148627f0a3b69d85083dd6a73627ce4e42766d9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "24711481c5542856e9de94a2efbe8df3378dac68227e54e9327d5c60a286b360"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f246570a2cf5a99a3452c56b796012da12b3b115f38d82b2ee1494b5080c52c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "525472bf77c115f454573e586fa6f48741c37cc6c50f9ddd8a3917820dd418d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1c6831cbf5b45894fcbc3811ce0709ecc7d8e6605e5f40e51c5d7398d62344c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b7529881f21c17d4b74b1550895d8c167cd7453e87ab794c9f2b6589c6cb8a6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8b45a29286aa078a266c95717393676571b662c7694219b6b57b139a8259db1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "228fd85501543ff8c5acf2dc025a89d6d165d6955e4c79826c02850cc2028e12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1d5ed87d363b5d1d42d7b084dee467a51801b7dfe42196234ba28cd5abcb577d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "566c5f02943ad40059ecc52b1ad65ff10dc13d2071637efc85542275c2bd7e03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fc2abe9273c527588bee13ff1f13ab343d276a052f7d23781a84136029649a53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "56e37239bef9cb07840705e714e3ea41207af66f70523d05c4c6cd086cc855eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ce87c9a51201055a61e390b76d4e4d03db3b75723d301f299f0788966083d581"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"35c523af27dd5e6cd3902314b86bb0fdbc65f6dff6eeaaf054906a857bfc2d8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8ae56d2a5ecf626d07654550a1d7a9830e87bb18722f62c9fc03ec4d201848e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9e301bdfeeabf13a6d760aafbdd97dc8da8a5e9efce43fd2beb63164896ee8a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7fd11dced2b71cbed0ce155b6f5b64ba9b8e190f0b2fc4d6d3df9a5f03f8e9f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, 
false, false, false, false, "75ef4bb56bda343e4223dfd988ce3ae411f640da34abe09376c96536cbad7f03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "09513cd796e20ada144ae5e5e31c5eb0f166af84388d3bbbe36efc7564708fd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "77ea66205ac6ce0fa33ea4433bee724e6604f42ac324c2156211b3c3e8bff710"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "902aa3f6852456411c3b396042810941d6734ecbab83d093d35f233a1590ba8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1e45fd01c74f38bcbbea09ad83adc07aed81d99b7ac744fe34aeab5153729981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c33b36a31addb975f51c64c7096ec1611a3534757b5af04b62961288ea1f8391"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "acf219e9bd29fdec82f6d4b841e92cffbea0616f48e1bba18ca657b89e55ae3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, 
"dd3524db601f1249ec9db5da04a9f0d612330588fc9edb4e28659b4ba9245cfd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "38fc5f21ea52854fffd3f40411dbf0a2373c2f374be5af9fc2f3b0a6eb271d8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "9100399205ba4726939406d62d87812b09fe172cbcc4a8fc648a6bbda4f175ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "37c0dd3d4c66f4d04f8e935e00c583239448a7a32b28321e96f164018ccc435e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9a437554df199dbd87f35a522618debce4c560dba6d248fc7d428530a8e56a6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "614e971b64df9c0fef42e2c4d1274e47439cea54735d151050850407a3ac4316"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ebb6ea8f529d80e8629878084e6954d688d98200015941fd187fa52eb0e463f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 
124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "61e56022c8b148e673cc8a29b44a71d31e9d61dc35bb6c68abbe1e55df5c4882"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "c378f18dd75236e70f26e19d59abf3d012037b20cd3233a42dc50d0f4ca321ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ebcaaba3e51b72c479c868c17947e7898797d06ffcb5e9a54d7fe341dcd5a2fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c6495cdfc0287345545476bb91838a1428e2216191d9ae10c515b1bfcc3dd5e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "be85d2a29872bae8ef6fd792263d42bf6717112490c3cad03a04067cf64945bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "a30947946c13834d6a4b62c32206fa1fbda58832f943aee16575f0e1f69616a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a87f3bb1f98f42f8969fe8cb403a5e937810b3a10f9b3c6df4dc9190bba34833"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "03aacc4834f473257257d381263b5beb97d3113ae7aece95998efc1cb7d44ae9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "523396dba5a70cfef3181d5253953baa89a25fdfadeb8e619705302e621678a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d430f3a3e2c3017f01222fb19ff08714c917a90759a98e7194ad2e81fe66bf0e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ec8924887625ddb6c2df1e64da8ece0d535f62016f96013e3b1fda557f548492"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7505896c609e845076319ec3bd8ed0d4352f2f3a0c5f83b61c6871a65a9f81e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c922f8225507115418c62caf1c6fd8ac0313a7eefe95c997d2efc287f151fd43"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d3cc5b747c16a6e5c337fd0594963f26920efab59156f99def1f878cf73b3f2c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0312ad1cfb1e5cb6a4155f60f0e315d263e6b6a6336a6b5747482860de632a92"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ac93dbf41776db3c802a751865fb7f1db40e1dcc63a0783230597d46879c4b44"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "573321ab7ce413b0a90b5bcabe85d26d2d0be8987cf35630ff08951ac433a610"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c0d8fdd880a00fd1c267cd57d0ad7cb05fb4100e281fb9a98e2e34614eb60fe8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"1d80b8696caaa0dcd6414775e0f5aa96a880641a07c461164c741c52ff5a688b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fb570d05f0263d4d86b4553fd131e772449ed29082597740d1c3319d6e66753a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "50a930cc4271eb48488ade8fe911560a83d1891a4cbc3d651db8d03f28eaa182"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b4f1de95286b197f7701244003ef728d3a55a37de0d88f1021ef73760946cb5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "91b51a1e136230e10c3a5a671059f9b541aeaaef6f0d7d5307ed219b98832a21"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "56b9897c0b7512f4dccf4197804d67750dee041d3446a652d3ae47cb76406eb8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c608bf60570612a8affff400156f9029bcd91fe66317dc071c8499045e60d21b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "002dc353d23bc58129677c78d4cabb5a1566e3b00aea33d9d8f91e2a42a25116"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3b61e7c49f1810e32a5c870abfa5396d125a4a6c33a4ba5e41a081d59ca5f316"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1625f688f73df090796825cda6d4dae25e678b182c403a5117703c2aaf70e77e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bb59057c72bd1da919941e5ff9d14c1732a93edd92b7976cfc707caeb1f72658"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "694defedfb9184f59008cd8dc20c443a07d8c362e3dc85b282a40fe8a791418f"}, 
+{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "32c5a2eede99864904096d457d3578cedb074eae88083675d1c5342dd1e7ac8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "49a89bee3e80361bfb92eb5a737249f75933b8cff951d4838dd235ea3c9be0b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6119791f09c4d7611b6cc18af31aede3e5eb36ce0376518e265ba6b59144e76f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3f5966cc68fa8c551f7771f250f17be1f6b8d2a9b6db3ddfbed0667a3982c5e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "82b285c68d180ac184ef3647cb83371b57397aadb6348d8964c7eeb7ecdeb04f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c14334dcbe164b2c131777142d364df28b436eec04d0ca8714d01e5a87a59652"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f62dc50d1c718583e30e411375af41e4ca5af2e0b737aa6324311472f9f8e643"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b6429a13201fb9861e31450b147ba9ba5a605acbd34f41c3a7914afde1076dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "22abd56ad281773981c8b01e836faec448f838d8bf77d8e396bbc40ae02bdc23"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f60ea56acb7b56958bc73683716d90dabbf2dd9f011d50cd417d4af4ead9c864"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "0c0e5adf1bdf99bb0944e8dad0a39067fdef3755dfa36936301abe29dcbd454d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "61771c02382c4b7516cfdd365cb5eadef661dfa8995403188459768950f59f05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "39940a957c3e3c34f8a576f74fb362ceed5bf752a2914a56b77040de8a445aee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "693efc6a9515d88b29f792bc8f340e921d1344c0c5972915ed214977e8c061f2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f1eceb7f7e9a0a9fc569a9855072553c08aa86b456b094ab0ab9f88e24f4b376"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b0853edea6beb53cb9df0ff863e567dd3eaee38581701b7774dc1468fa2cb25c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5dad46278270cc407e646bb287daf7ee12931f73c5509aacb11c0bd79841433c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c408d26409aebeedce5b793725089299b478595e0a68f20f5271d22e67db366d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "20617902de821367f01ed4e290359dbf7adf29ff2c202f305c5fa3754e30358b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "48d332eba16886238e6b6caa0dcb530fded7b05fbc56fa8c76407621e753081d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8f9063479bb50dde9e1760c1704a30d5c5c380a77b6a54ed26ce1e4aa16abec4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f5873be7a8585e00bacf9861afded13cad0a53cb030e3c399fc29d766cd7fff1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "65e047a1305add5b7522dcfd9bdb7fe7b309d834d4ad2148481c6ac9305ac02c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "7d7340353e625c65cf1aa889ad020c4c3a4af401c9810aef27f1169fe42049b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "66bb29e2fdb5b183eb2c54a46e3768c7f94f58e62a753409812cf5576b2759bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aa48faa6ea5fd5669849adb1f653e605cdfcdfeb1475f089d29ac0776b5b6650"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "cf6643d63dc78e3d954b18860b39a934c0b69fef3a4c2e7e21d75c8046e85518"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8887d4cf5501a60cc050ff781a47e1f7b3a1975a61df764523b2b5fc72afda5c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1d51d5b29867c3f002a9b7fe499ff470cda5afc772fadf227ca9583a5f301b3b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 
2, 0, 1, 0, 0, false, false, false, false, false, "403909a8efb9ba5534cb3b84ab3a01bfcb9cc60aa005fd69170db93d01b6430d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7c7b898c944ab054ccaae22a75c214fc6967f1be7c880784ecd112309283ad77"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "439f1643e5dfcdb2444f2faba4e75906a95ce6286a4c9c3a6aa1828ca2fcc60e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9bd25b08a1af3781d10810b866957f0c68315669838c1e83a2a1d736dc40d984"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2ed225bae0715dfc3f4d56f47c1180502bd7f00c300d0263c31738b7595b5fc7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "39e9387bd5329c02e900ccb09f2ab0c2a9995638bb359170d0197a1469bfeb6d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3a36fcacd533678f2b29a087ae7ad967b941ed93fe28b454178a267599ef7397"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "7c54b1acc33086ba326f8bf64374b6a5eac13baa11b823dbb6bfc8de173c4011"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a79918326e5006bffc19b33a7c597ddd343de33314b48ddc954cdaed6a67f711"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "2df18e153cffcd6042e03d4d75910f2974a91eee1498bde697bc5d6d5b4a8f1d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "851a76b490d3c104e355bc66497d34ef502a5883f9577726c5a9e181f8000bf0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 
0, false, false, false, false, false, "3abaad2514fcfc68a3a783c7f38f6c1a73bc96b0a31fb5ce37771d4b86a18598"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a7f1d1b09d2cc42d54b2bf4cc5acd0a7d90c3d335a2967715808ea8e26401c1b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "691e563e2fabfff721a8d03d853e8d1006e28e0f53ab3201f81e1024c79e145f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f90da5e9f01c110b0a0dcad7c4cebc4f4b7eac855e258a6291276ad2bb2011d7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "238bc45d87c867e56f17995fc61ee4e35f490106f3a6349227b2af547a873764"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "75b40d7f4f099350ba70a895999afd9c30f187a80af9336afa7d24332d9b6528"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8e21ce77e8c904368adbefb667f3d9f6becf02e607920db0139f5a76d215fd14"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "da7ab9479c44415ed674ae6897b9200d823466e10730e0cdeeefb29296bb6bc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "73fc2f9fad102ea219014f6871b2d6644c10be283db4e8bc3e69f389b8cb18c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ff58483b63314dc7748dee9d03a842a3aa05c4d5435fe59208379221ad419721"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4b785d61c41311d3ca9061a2afb1ea09b4941466bbe1bf7f8f30b61c0ea4b2d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "84ab3b21a40e965416ca5e5ca04ca77f0bea3122b72b99d774b037e48feb424e"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6097618d4bdcdfd8cf1298185f0a78d82030742ff67ea8d8d54d93effad18cb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "658deca2f32a934b248b309356abd455becc0db76a30aba62330b5f5d1d23d76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0996208d59864dc8884585e17407bf727d25fcd70136cedcdea5f8ad221678e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, 
false, false, false, false, "406b122660af33861b9e1b0096f879ae16cba7291165b78a0951d552a4925bcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "41c390184ec3bb068c23d1b9445eaaed1bbb9f5f2c96071613d0b3487c07a480"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f5f9a6fbb453acd7eaa9fc6a9f68837ac5fbb803786561232b76c1b3365b9b83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fd50f4395fb861a37f6f5402a9d7adb04cb55017c7fc465762185c434e61e783"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 
0, 1, 0, 0, false, false, false, false, false, "318441b107cdfdf3b2d88d3a1a9b8440a825b6c265bba3f7ba3bfce891b7ad72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "8993d784a9835626153df1b09ad3800bde83892c4da1e536f93eb4c28bb09d6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0c9d951576ac8a2f6b9f056771b54dc345f8f54680ca7edbfa31ce3733cdc92d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4fc23937bb74837677607ccedbacc380b9da61493f60480eea13743c3b31996d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3c6cc9eb571cf695fef83dd055c3cd483cced795de0b4fa73ccb7e19998468a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b7419b89f0646028782dc6fe5fca42614e7361e97db7caaaa1482124f7e91154"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f627c3da047cf516f88c2dddb28de938fb3a386805d7edac3c4fda1029f19962"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "94e789b4cfc70c6af06b8ef852ee9bc675b3ade5056301ee09117a5ff347ad37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "8a1e11fbbf851cb4798c207a6a8d7c44e42738152f743c46e2d7ca2f3a4b1b7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fbbc7527805837ce1952df5bc8e0b1450365911a347ee13e1a49025d832134b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d3a42b0a3ba4f840c8dd5144b32e9c0f9f322ddf02f366cee96e366479b700e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "894630e02dcb884f0f677bd6649241d24a9eb387ed51ec9f804dc1ca0e43c7a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "89b576df2fbb764125083c0dcd67378b9e91d44b17d199c9d4b2045e335df6ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d2759cf767aa96a1d7ef088d4ce908d5d498c0902c51a5d75c56ab550cbeea52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "47070459012e723f17557d728ed075a46c7580173402bdba51f400d5a68cfbd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"a88ef27a43f84c95a5fb3b388cf7929dd3e198c7b557dffacebb9557bb3c2c22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f09bfbb7c787edc640aceb3d567f67d9d1b52a59390cc67c0dbea2f59f777ca6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "87b2a95c29f6f320638dc07aa76d75e578a0b83cb1cbadbed82a59a5935ba4ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9ed40ab2b3a85424b9c03f3984786b671807dadec58b894594d20407c5fb22ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, 
false, false, false, "03e9657e2c5e9bc1eaed8a7f0f55e58c15afe965da8c500c73d112d506fb8795"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d36df08c39aec71b04a5fc95ae2e7f0ab25013736c0cb979b362a7feb92e0408"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d8c6a71428df9b9f10af2a3ce62793ebf4ebf66b056ab3e9172b4f99abd7287e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "522442a487146f44ef75d4e4c28cd66d667268958f5eecd0f105f149a974973f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "dc0c497720d9bb2a53424f01db609994e737f94df6ecdb2b78a9205d90ae8016"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "81b5350fe5cfd4fc275ae09278746338a0b11753dd31f880130ff983ecf578d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3e54ba3f0c1747847f5c1fc57fe95d868f682a8d9bc24da4dbfabaef9f32ef73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d6e3ca6fc4d43214da86c883c9eca40ccce72c7c343cebe2ec217c75ee6847f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2b2f751178da8e97700c9feeb504e041c33d5fc50e4ee51c6270e2fae5613a37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9d131836b8e7d4af676f80cf131dc33c0f58d782b793a8e31ee0ce051de9359a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "138d1127697e2632394151bed0a9bb50eaa93d3a97184da45c567768dbaa549d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5555fbb0ce331ca4658feec77ea4418af09f3ff4265092c20a7aa2d8d6eb8b4d"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e1f9a300d2709d1ca61c677e104ebe5ff9bc6052944fcee2df2d81a68464aa5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4d9d10020963b6ab95673d0eef4fb3f2563b222eeed3e29522308b77305c115d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a16ea23165d2c07500c3bc1cec989a37c6ff16e75042f4c2fa6e506828859cbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "75425f220cbce74a896281ae3e34bd063b40aedcd52f6e7cf0a86ee5291b3c7e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fcc63380fa5ac50ac4a8ec8e5a617f62bd1be462414fdc47ae82a93f076a6d69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4f59a5b4829f26cd48f88313665962db6cb3ca325eb3921de86e53dab2613f15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "802463b96c975091446dc1c5644fd0051e8baa1daeeaf7c9d7273b1edadb4677"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "81d782dcf63abe1f0218d2c0c2a2539a33deb9176bd67f17b9e3446accc3147e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ff6cfd828cc458c05924ca4bff9f6a1a0fe7ff6d85e5a9c38562c97942d80328"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "099336e8dd8f6b722509698d5fbc958b484040657309b0437aa9e41e2479f193"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "999cde17dabdb99e926a16f216ef24b7c743709f20d4b9fe847df45040927fa6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c4c92d0ce5bd4953a80957748d960b5cb03849a9a017819194eecb5ffeccce9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7cb34890bf183e5805e05dcd565cc4eef249c59d679f612f167bc410f073e846"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cacbf24a9ef02bc0d73d638c68060195fe10f6b2e051f2934d2193f91119b251"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f303dc42ed134abcfaafdba6fcc394cdbeeb1b6f48c6d13a10a9d5181bf1e71a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6575c93dccf010a206055194130607afe4ab384ac9d6f0e517b8f37ce48b4830"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca1e3ba22032b40272f64c656eb1f60dac44429c4fa2830796bf361ebb603f39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "be6ff20f264a2edbe4f875fe040d734e9212aa9d2b8265f545239710a312c06f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"fce605bd759df212a749664834ab4fa9e1a3a2cec4d8c93db402baefa40e18a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91895118956eab80046de952f4ac84b721d82baf6775052ae21f21cf765fdc1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2e6b30f85bc02c68a258388361f4325b1e354ed648abceb7fb42c852e37fc624"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "92cb5514eb431361b1ea8ae4373d53e2dc2e521477de12ad35818e8357f75dd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b3d3c4d2a74711abb7d12a9c2aac5a74015c59dde53c0addbd6ff48f3ab21cb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5860060d51ab81a6e764225dad02bfde10a73370c106e245747850ede081d943"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4fbc3e46b17c2097ebe77ce727e88216b90146d0cc6ad8d26587191fd8c30962"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "24eba2ede4f35d11273febc525e542fba06118f1232f4c1279f8b55e10b1ec57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d03c7804fd9355aac8fb3557020140c9f4c9a5fc6263c55ae358a97b4a0fe21a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "28d3333c6b36865a501137d7722435953363524551690e94cab0209aa6a60235"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "99e25f681433650a772ee8de2adc63576addb0b3610e34a5dc455c5f41068403"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1910c0720cb2d07119a9038913a13748a6b5c96088ff54e1692d4896968179ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 
256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "52458ff0ac8d895501e762b86ddeb93f6bace683a734ff437e67432604f2bc6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d0123f6db39a5fc923ea68c81df0908c53809d56e0a0aa28e2fc5b4a4365b468"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "108931a0625779665fd3468282ee9542994c924c50dd46154b336155384e57bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e36ca3d4442e0103c9a962f8c3bd6d2a55ae0d6257e989840efa83a63d63dc2a"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6460095f802bacc7644b1e17c65fc9b0b052e1a745842917ff934ace545aa107"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "adce52ec6ee78825b27d1389a43ddda36cabbb7103a96e06ad9446f8508bda3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f0b60c44234eecea3c508d94032247e1cb954e8ff7c587e1cebf9bc77ee3030d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c266adb33d66846f478b55aaea50a48dd373a0bac2b722266d409e1534f0a09a"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b93f21e8f4b98e878eac2ad5f58352c0aa89f7b8aad57f6d05e5f94861b35c08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4cc6b70c58779ceb52fa2160d72276fb732539ef0613b1628b95373cc3df4e6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ee541b5fde7d933a28ac49dd4444ecda5a033372a255070a0118160999c5b02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7ec817a60e2ca45029575f3048ad2716c9c85bea31b4f5cb08256d243df0cfe8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "94ba0595579bba61e7261daa2bf079416413995b0c63c08bed95ed44210b8499"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17c85b2e19057e9c2a71af80c71bed7b563ffae494dd18da5d13e37de7a0fedc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "42a4580d3b0d7b5c4e8acdbae1c7872b4b9459cccba628fd0e6504f3cf4dd800"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 
2, 0, 1, 0, 0, false, false, false, false, false, "26e5a3963a06ab8a1ccbb9d6482c0925c0db64264c008eddaedfda6f69e5bf21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a2f6c75cb09f9b54997d873eac4825031a17a91f183c10d7414c7a1e404705d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f197010b411c697763c9a0f1acf23320cbcda66d39b575265e0167e37ebe114c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "489597d9a4eef5b63c0aef474912912f4feb1d68beab0d5b3e431fbb6910befa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, 
false, false, "2bcc3e6c4fc6b61ebc5666be1b43d30e633fe17948d8d42cd03817d50ed21f26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "87b581923feed3f7360b6d55dd1374eeb970ec6693fe998327ac17dc363049ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b4fc2ee7a50f8ce0349563aecdc00becd73aa673cacd051e7d710f865504c33e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2f7e96d1e0f2db3b053d2de2e761521f90cf99520653b01a59294095dba1daf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9bc5e5a39033354699fcef274bea187d06396bead476d381cd25db1e34960bb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d4a2f54cc4f1da68aa1f0247efb0e0b4aa2ccc917cd6f77c7343aa996b0b9d33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c46221f921645aafaa125aa9f4738591e10c72db47819515d4385b1634131bb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fdcf126c754fabfcfcafee1b71bd654591df00f63b233d42c7b8351860bd8806"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1f72eae42bc6cef44e33197061efb28a7ba9b5bcda9519ae1fb18870e02da895"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3d3a1422eb7b690ddab31b74c3c3b13529d8d355380faec96886fe19378c6776"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4e43ef0c43c96ee9c944ffa2f030ef0b7ad286e07d212034e17a56cb66bf4d1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2390280c9dcc8ed3fae71eb06c28424e6142783b8c3692b4b79bff45333092bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "266e3693afbe1dd5f14ea454bbd7fc744e1dfd5720a0b2c0919b5a84f7a9b988"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f2f87ebfddd473c5a6b0e023daf0ad770c1aef09d69fd2af4f221120d9af87c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "92c226c30c9941c9c93638b6b85a46e2839e09dc08dcc28fab52e66ffb0f2c37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c1a1a5a927b9765466bd863c8aa079492d932e330e4df98d4fcc91d61ff261a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f84f6e31718a057db9983af263187c5d53c3696d40e5490d688d820c2fb565f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c5561ec7098eba7ef3ecf116366b095a8e4b93db53f09afeb75e52c518490b08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6dcddb8a8c7a8fa0c1ce92b3d4c415dbb6372af5c14ed77413d9fd0cd0270a6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c98c8ce657058c6022bbf434a5e3f3c3d3a24c184735fc2a1c749957d3ec9916"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d1ba688589b943b5d8a3830cca450ddcb54fa6878a3e4aa6cd225b1711f15cb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c2e24c6f31670ba6d4ad7cae750044473e1a554bbd047e8f9b4beaf270026f64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "09a812e477922655d5d8c6b02fe686f3f8b6e5ef0aeaaf4661cb362d6f9b8547"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c391e9cd5f5a7567ca523cbc7fdc372574cee88c736a002ab9c7d379d4942606"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fd6b4e6451c67e67d90920800da586c5f63930907b81b5142504a4e0b28abdf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "83f1abdfc3805d8790393f193cbb124a971676d417810931db0ddad4ee7af346"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b3cdef7cd3abfaef6c41ba58d04f64f8682cc07fbe98794bdf51026b38063b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "4924d0011ed48c418acb3a9f02694b34034d8411ac0bd7cff60bf280851ad65c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "88bd6435ac9819444fa286688b92da1db12c701cc444f7709326210610329151"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "319cd460af872dc60ed177b61087e8894a665fde1cebebf46e9fd1c7a3993ad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b982242a1e3d432158a07c967df450ff4e722fd5355a782356413ba27af0248a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"321ad269943e9a78eafebb9ffae71ae621868d99c1fbffb9e876e563ff585ba4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "1af97e76015bc8a4f150979fef85d8e62662b052862c336d50e0fa94523efb8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "156091db7f90cf3e24c4b5e82454da392fc2ab3b53b62fd9a8162fdda336f379"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6614d23416f5a334e04c822da9eeb6adf58b36723581967927ae68b28a06cac0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, 
false, "3e7849ec7a8f88aa8d908d024bf738db1e33c2199705983bc7e32de3a3657223"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8c21ceb28e4be61ffc7c26a57c91b2bcd428d644870641bfe45007bf77e680db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bf56548ae0c15f454b0bdb3254b690e330836e43a9b72cf7eabad79f4bdc18e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "2e206638f25b883b5491f16ae8fc084e798b26e12494101651f5dc5a54531ae5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e29c937d19b57a3815d51c6c8e10a458f0fa4aa939473249c35bbb574bd5b9a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "518109e357c4dda7af3021b413c5c92ac2bad3cae7fc5d311e50a687b989d86a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0528dd4b45be8291297d2b10a7d39e6e591ccf4947519bd5b1309d098a2d9bea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5423b6671ac090beae6e7045a13ac667c08643195727a26a3ca77f938164a727"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 
1, 1, 0, false, false, false, false, false, "f6542f30a80b4a14c2a01b26040f443ab5f61013b51ce16c0090f5d30e01efbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "79945f1b571598743af6ff959cab2b2c41253af773fd4659de388dff4dc177df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e0bbe042a7754d6371146d93afbe8d665958d916d625d18febc1fba55f8fe942"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a346c9a35fb3ce46836fe4e3b2486bb8d8b8d806e2fab841c6880a6102066e24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 
512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "34afa7348d77923f2acca4856f8bfaf1d862e72496b49c8dde8f646d861af004"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a99fa33fddc0718f45e070f13ee98fdfc028cebc4ca1506122b1f77f52a2e0cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4c3fe52dac8a760e95f589fa5219cb3c5079a8e74a218606218a42a01e842235"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4bc3ecf205c2aaae526a604f2637c0560840069cbbe9519bf1048720a4acdb28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "965ec8c91d843ec1e98d4b3501928266056f11271deb928f0c2e0f4b8758688c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3e6e1e057cc8271cda99b59b71a4be9410f697fb8e295579ac8a4aff053f4008"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d6de94b2a313e8ac2c31a4396e15edb6e8be152df1eb6e5bac285d38e85d916a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "23bcf3d4ac8cb8a9bae42bcdc0a356cc9ed2c8e1f1823aa60626b3153e8db846"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c57afc77998275b3c26bb5a2dbfcca23425d990800baafa18e7b0b1fc8fd8caa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "88781b3b915b6f287c2ec242434d58957b3e01c13f54908e3050a9bb0e1dbba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "bbc7ff6069c9e7f6dc8bc952fdbccd5ca253d181941b8ac7d4a7b5953c0e1f8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "813564889cb240fc880ee02181a33d5e9bbf77e82cd5b36be2a28e667f755f78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "45f3eca2a7cda3b2423aee6ce7a674ecce67e9aaa683b80d33c0263d98a76075"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4d0bbb70d116d33c1f5e1c8427bd76f358b91d04edc3b83c44bfd035c73849c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b76538aac86ac2dad0b91437528c7c14eb29e70aecfa2e8ccb3964f1265bee1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1fc60da75cde6222e313830abe3839af713f266927119a1e421250866ba73d8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "83f321b6821b7232dcc9b5f368e9086ddacdd734277df1b416a677533a5287f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "483a662e3b427e50a717422fd60483a3bc8ad8e7b9cb3518ab4e546e91bd7a1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6f5fcf6abd307f5eb5e29ea1800c8b170c46643ba1c683ce9016aa034e9963e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4847e4d00c70e97fd7b4c441928755572830d50c420985255eac122f3953161c"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3965e9d6edf07bf9aae806de632be3ee9a24c11faabeac16f3953a613aac7b4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fe4e166d73c4e05d6c18ab9273c83ee8aaedfd87a64b74cb7275c6db4d968c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "976577391fae52eca72e5efdc1a393e5490111c65244cde56120cc10880b432e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, 
"f454d8599e022880994d1c20b6878d60e31226a08513fab28552d20ba7a2161d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bf1bc73e68f730bb47a7a12ee418b9545f350dd5d77534e0fa1de1e80ca1c927"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "cc0e2708bc79b62d006f190daa139c4116d51bec603366b52f6d21740e33c12c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "95b3746f31d0bb2431a5597a641c51b4b9e0f0590f0a895f89cfdda03b88d981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, 
false, false, false, false, "c94d479c3519a01588f614f0c4998906a73cb376b0565378cb51d132c5d0ccbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a05f6689cf0a79427526fb49bf31360c973adcdf3338365bb9afe2995a998464"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2cbf6bb0d73eeca20cc6e249ffdcff422c6f0ca50472bfd28f0b12f4817354e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c4422b5616f9a4710d2380733a5f1566c0186612bfcfe567968a041f66958159"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"76fc9da32a4ab39f383f380269288f62b3c6f3e6d1dac08ab3320b6d853206bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0e17025a0fdc6ad1ee2f35df1c0cffe891122b620597a5e241c1bc0f570a3424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "30fb0f9295ea189a0fc94ef3e89220a17c54ad3adf5de6b5fac972ffa36d5033"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1a863639f0457fe4e8f561882b18cfa671ac7f1cd9d00b862f526f5b1bcd4b38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"4072190dfcf2e0be287ec3d3b432b1ba93374d169ecbc4dd8a0c3d5b3167c723"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "085bcee97144fa256afb6c2a7bbd4f8078ee1dde992ea5b412ec381a73a53089"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "01abd7c90081300535d79e7195bc834636c917403a4226f9ad2054b7eb1734f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "404c5b2182678396e927cfb528b875bfda3e25ad0b40f9ea4b46877b2e1d5909"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a8063294f230ab55a119d369f0d8e6270b55c075e37abe0cbc9e5fa9ba4820dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a40683f45eb54e91b7f84eca4934a9edbd28dfa748e6c92c26d5ca8133157c4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "09ba9b2c0b0b2ba780a1b3b4afb61eadba9c8fdff11aa8632fe4a604f0203469"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "883caf832ba53a69b4d6de05637c29b720899f1c8ce3c69f728e4e360f05b0b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d6f6ac1d54854b983379414e1b67139f0c8246b4465917368aaf4f76f110f11c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "65dfe61e05018f4c4f52d5c4b1777ec4ece8c0e67cbbf922f2c3b2b40f1168d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9a6c646377738e2570f95e99d739cfbefa91d027d2c0be5427753632444ae8ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "dbd4ed3803219950e337df8589793d021cb215fd3f9e90ec348b80624f62e256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "df89f9ee3822884303de3bcca685643aab374c0fde1314be2795d93079a437db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d106d2dcca2914073e055e4112b040bc19fc417a884c72882db62e2a755e321d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1e2028a1633f4e678667cbd56013d3e6fc5606fa9e6ffe8b9c6ee09aaea88fdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aa2d2f2cf91f6585f1d68acd5c750dfaea3c5e354328983d959a5371afdb4741"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "392d4d3d38c7f39cbf125d95a14e9423829873fd87313fddaa7a4f984b7e2628"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "827e803879f90b331e37101c710bdb7d3fc6374c48d33880588e4da0e26c0ec0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a4b305c8cbab00bb992c56602e51d74569d7c22843211cd7d180a97af13c481f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bb17fafee027f28ba265fefe003dd6ebadaae93283227bee79fa56e33e340652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "09db949019667c4b117fe28968d7be3386561e73d733d7dce651b908aa4f54cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a9d53c9a964098dcc75792d835ad003fa4f0cccf6da0dd9ff2d34876228f99b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "293acafe22485810c6e06c97f40f6d04a39634ab6ade9533599a6768b8f6fc1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"13b7337d776f97fc4e2a9988ed1d376c458cd57dcb452782b0fa9f589e1b4798"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "0a9be0335ab3a6182906670b5d7ab7502b642f1617c51c16dde7f6a7580c525e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6daa7ea88db7347c4b9b6373652ac43a5863dfb1caddea6a0344bbf4693039e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "cb492b0b9ebaa71e767e2f9ee661285847f189448a3b3d49be047c3fbe8c91dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, 
false, "5144624105a0f04326860fc29ee4bf0afa19241d384288f246404238ef2857c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e93c1a16220400d6c421ab980bbc9ad26e69fae9115633db9e3bad242104c99d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c93cf14e3ebf3d24076ef26be0b63b37e04e17acf9cfce3a5e13cf0d0c052ee7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6118093db571e55b25936e8992ef1978a9534cfa63cf73a121c69847f6762f28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, 
false, false, false, false, false, "d9933b8424e77d58265f4cac935ad3b3fecd5207205e98a34764845ecfecba42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "11c84efc4fc660a4da689aea155ba71621b054f3693af325b00bd854d073f9ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "27a599578385113cf10a80a98c6b5283842f22f29d3384cb6e2224a79a31e9c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5e4a092d3ed388cfb080520665a99b5d74f89c74b1dce0578ad8e7744c2db0bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, 
false, "333be7ed9b9bc3909ecca9b644cb70275e4c5f4907808d89c467cfd7c919bcd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ef8e699a089bc929debea3e900591280ac56fba6f44adcf57e6265a03f879051"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2c72fac5ff069c96d3240984155ad7e57f57a1c2503af8f2da4a618b825b1802"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7d3d16dafc01ec2b5dbfa8e7b08878d45b0850f97c477c77b4a7ed21cb6e1965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d3859f5d7f6469885f6288d56a217143d12f49a2d9b4b01f63bf7051e424bdfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a3fd50fcc82c88634a04424580735efa93a31b9ee921441e2510b6de36458a05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4ac3b81bb7283095ee87c0bd4dcc7b482254a3f5517a30af72f46780c3145ec2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1bdcf9ef666f1e8f0d17dfd2e78057088aa34b60bd9351956bb6b43afb18cf40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "70c228b3fd724c0b72022b18a5e179cce29bf15714d95563ce750dbc7a67ab3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7f985ab06d48a2f9e26c84c03aa585f376f8a54ac3479f7ae03ffd3cb3cd064d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "1b1872d332fdb5cac340d87b58978dc6a7d41341455239d9b23718eb4a6fb8d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4e8438785074d5b920a3088212f17075b1f3c6df2463bd91892b66e6f0ad4476"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5e0fe76c56cdd75b3401fa42cb80ff19fa0596ab8bdcba90b1eaabccafea8206"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dc018ab1ed58b3eb052a90fe71b660582bdde2671bcd064f2e2c54e8cccd1b9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e5cacf01ed56a24798403f83acd0a5cc3ddcdd446b4fed9a0802053ea50f8584"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a8d2734d5e8748f84456506557c49b5631d77b331c3d463785a0bc0ccc7a171e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fba0709001c3e5f1a9114204de91efe3ff5a6ee65e97cba54483868880e2ab42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bbe303498c6bcc859cb3eb8e3f0f4f8bd4a8b29aab73375bc0bf0329e8a80322"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "93ae5a346aa77d59ed6d5d73d9338afff940b2767af6f3cb389e9c8eac424c16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "19fecf9e0cb9bd1f553420d464d216e21e6a2c6b63c5cb3bbac46d0b52066522"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "da4d88c9e5e0ee25f55e7860e930c6d5b2247324f5895f58c90ddc3acde0f9b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a7541923e3761b511d3daa41319b35e1ffe05c04fd10c34feef1816baf89c3fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "373a9847b4e150332e6328d077a434a3daf7bfdbbb0bc98d1b210c92923f3c8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"c0dff0053dc9f5041ca460590522093388f5ea5615fdb9fc2c27a55f87d0242f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "849d399ea4e3a65dfe9193fa1972314c49aa0e501b376092fbc1513b25c444b2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f4fb15412073af8feba51725f4f4ceff320eb3bfe2e50bc6333187d9e1e6d0a7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "88967da637d4b2e070edc02e4816bed10e9eaa9e4da691f84ec928ba0fcadde1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"9cb82eaa4da4b45e79aa6ce608733b457e355057116266ff31ceee6603039738"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "5c14548ab880557f979a1eb1dc8235487007c70786430b4622f3417bdeec34bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ef5f7c62c68b795f5851452b1c6e13f2ff50069c5e694db6fd1d74e3595d1085"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0f9dd179af6b0d7ee512303cbb077f6112a7bddf1b24f37f45190397cc51833a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "db91094b10d7c306bbb5fb3607f12a744c26c44664cb4cdbf0f66d385766e6f2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "2a3f65b5d2f8476c322addf5328848d88d3b9d6504598cc5434b010742f826bd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "36e63a6d413cf675ead0fd412880c218bd5414db5fbb00fa1c5aa67d137e5dca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a864ee11e8023f6a433fbcba0e5e35ae0f7014a32a9dba4d82eab45183abae51"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 
1, 0, 0, false, false, false, false, false, "4bc89ed4798605e79babe603dbf523169f291dd11801b535709cba260612da4c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "5db67ebb10903c128759069302e998e747458f51de2b9005205f2d4815b0c998"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "37a0097533bc638d3c7ba0077dc12b9ca557b1280404de4b7c4e24a84f10da97"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6e9f3f3fa57e653b5036cea035fbff40e831dc607e6027c02954426ce484ae65"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ec75333b9bdb549d75400dcc86f35b2f8defdb716db0085762e8dd127cfce34b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "510c9ce7e1c59999d2e2411af632b30ddee9d04f993b89e975029a9dc0a9548e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ba56756ee66bf2d07c9d2c1a2dd5e98aab54d50b363de9675bb8760fa413daff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "077eb9cac7ece892ea5a38a4e5ec8b0c81face5364bd291cfcdb32fa6a14e0fb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c9f69953cc8fd44ff8b6ec94b261a00f33d80483e9e66f4d784b77a6fdda4cb0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a0da587c1be47309a3a3c0534ea6c166aed5d79d11ef82ea53f9c3fadde453ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a629740a3162cfa28667dc01c2a1c84ff6620e86baafe7d0cb4709e9916aef1b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "017eae85d53357f648a91eed0598f7b0a0864d3b7c1bc39dbb76cb8a9ada0d9e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4b37d7dd3adae80d41b1c45b1c3c9b7722cbaff147aae8c4829b142efbf655d5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c786f02061398418317fa1aeca3c9d1c87ac681750f6de19c41ce352661468e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3c1dce0e678db67e080ae10e1e194b753901f224530aa19b29ebb8ea3e782057"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "da6dd8756a24841ed4cafda98e2b0d1bc09d9bf3c6bcc0aaceb62ade6b926498"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a4236623c1d40506cc3c92f7e43e79f18b76f7cba527d60b9872d24342dc1508"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7dcd79be08409ebaebd927cad242660c6864ed41726393c3a3856fbab3987d2b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a18fa852735f9630d28d060748e65ff8ea45bc5527ac7efc06d2a158c122fb02"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "447ceeedf83672409f4644af4e32b1dba6bf88b0d1473bf70ac0fd5c6bf7968e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e8b2e63fca559ddae502dcc67f4b989079c85d05ae281f3b0beae5eda828f211"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b1bfb620a3240b68ceced2c2fb9a2f6f7ae6155df32ca322e043209adbbb7331"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f603f3f43f8cd2bc560c833c053099eb892d370fc63c1ada74495e2d5c88c6e6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"d7976269c63f11b224e0b0e2dd467f4fabc21a12948e03a5d47ab962d2b7cac9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c6e4c3e5521b1110ca6692ea2dd5f303a8c3fe5eb32dbf0a48c5293ae69d0487"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "060a85cbc342d059697a9cdba0ba7a073681a840cd5df4de830ec6d605600e6b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2b1e0cc98378314e737fda7f5da1d57c1582a57aa2bf347e64b29acddf7f4396"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"795bd696d744098b3731daa03a581d2e339eede7190097301f6f9a21e62e9462"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c4bd197642e3f65df050a0db38c4a51bf1833e076751fce8912a92d7310c483d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "dcbf86f13d7d063e4c9d247111a2d8a4efbfd578359cb79146ddbc1cfb3344b1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5efbffcee453959ffb0e361718808965623d98230a1e1127e22c44b130cc18c6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "8fb389ae86c21f3a17db2e2a5f5a31735412c8303c59b449f21fc94f6dd26793"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "96cca85188b26e9f858ec3d2f9595d831fb8ec33b55fde5d7c9b8694ca424ff0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "bfcbfd67ca2d9749e044b744f2e0a8115bfa98eadea16907400658067f3f6198"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "75f2e989dab0ec53b9104eb680e4283865916553c5aef2d36a5fd1cce49e6452"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"7856975f9c9d6dd9059a27b8619e8c685d5ca5c377b7faa7f35926b582ecbd55"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "420cc6a93318b6eba4dfe599058e8f401a23a22f3e5753cb13ddfa20ea809522"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "7b8a6776947156e914e68b3f14d4bf1d7dee6c84162c8605a6a84e563baf0685"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2ee4147b16d067b363050b1044c434501255fb813671fc94733de883d4ffed62"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, 
"6706fc0aa88d816c2691f309cf2fd190854342ed4b32a6c9a0f0c5b296cc165c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6d28763e2a09337dc2394cf84e50e90335f9239db6cd11890f2fff3ca5a4d6db"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6fd398a7525b05af6b404750934065ba2267d1e6c4489aed7e14d3311265911f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "9aa19ac6ea54802161b05fae3298787cab8a6c9d4ff0f2e8db7fb0d06c43ab53"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "f94860ed91b559d343b80e5c840da55fd2bd983cf90905790d04465c7e1e44bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a1e2cc1c04ca4b90e39fa9f3a94eaf645e577b67573b8379e79b9e9e1f5a24de"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "41e17bbdb434c67cf137d280de314e875b28738fc4b656bf85c70751bdde9fe7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "13be3713bef31711e507b53867eaf3d5de28b6c56c50347e6ce104090f919d6b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "8cbb7d3ad24b62fc32987f1710412ac7e75c78d07ad3a335ae4f9a8fb916d092"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "8438ede4d8c85bf08b4b3d550acdff4e3c6ceba8a32e66cd385501a38d9fea25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "418991528770bcd0fbb2ec10f69c5cf8a3d4333d256a701cb072a46899aad165"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 
64, 3, 3, 128, 0, 0, true, false, false, false, false, "37824b90b71303fe0ac40e91d73945605892e1434dceb6f11ab07e063a506317"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3e191c7b1e7c661b28167dcb729f5a2b74378bc60f361daea69e77f96c499200"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a87fd60aa4195677d9de6169f530bc110ba88b5b9db7f735e31cbe749c03dbb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "b8204b8356b002ce8f1bc08374862c0bb6799ec46be9ea9bdab386e31f4389ff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "91a0d73fc60b3817440ad7cbc3b717493a93d1ac298e6ef57e5875188e566278"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "140df0fca4d059cdf82caf212b45a463624eb90c27e99c7c8f3a889c1fa1e34e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "90f7271808ff9934f8c96f5a03cf42dad93ca8c85c02d434739fe8ab3d217e67"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"fc8bd1756b080ea0a6d3a915e755b6647516a9566fce18447237b80e11f4d3b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1574b67a40a965667ccb9d73c8eb61e1d06e2182d39326eb8c884b2ee9322aa2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ed2818a2a54ec02db7cc6630c7d07bd4a824e13bdb2156de76bb1d2185fe19bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "353af5dc98f55ab91bd4f57d7c66ec6c6a7d82bf3938dcce6113c109c871a972"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "47be222ed9af3af068e137e1cca39873229b378998235aec89edf28b06ab477d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c30e0a0d616c127c3e64594c2d493549642386e6936d773898bb5f5c19c03391"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4582e6b091823e5e18aa693292fad5bebadaf6aa58622c76ebf1d4bac97cbdc1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "00bce2e4076184a6704af6601745983b87efe10fa9da3e9a4381a66df4c9adee"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6f4629bf2166ec119a93cdbc1c91dc853293df0e520596e13b2cef80f2aa6775"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2caca3fe65be20d8d332e7aa3f52d1843b8e99ecbfef8e1a1552f8343c92393f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e06ad9bc772f370f90fe039ec7fc55f42ca41ff1e30c352be285fcd2193073b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b24e618733db4ef0efc5459bcfb9e67e2f6ee198aa28a117d3b4d6a51cb0424f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "10669c765f92f02c64a5caa446176ff09c987763fb6a2e570d81abb7a62f5c75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3e1d1bc817c608b64add737650962b71b19dacbd07c0541957c8ea82be8a7462"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"dafc63c296e76b32c1335807f8de627cbd54a28c619583bba9f0d0c7cdb0ff80"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "23a8d113524a0b0fd281178fd1fba584cb98368593a0d6282354dfd5dbbcac76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "53e9a4acbdbaf345982877841bb9ab2f3ea595071c040abf874cb5ff948d2129"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4c0df6761c017432b4226aa12b4a731b7f3221a423459e877c74f1dad9818040"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5ac7fd40b327df0a3dc0cfa44c61baf8e37a8300ea558be2345a430a335ddc7a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "acf752dc72bf6647a0e0f025bb4ee6a70968f82eb762687b4ee3356511625cd0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "6e3b7328f676b3f861165f18629a2c7fcd57bc726cdc530f08ea0c545c07eeb1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "c6bef7d197b122113eef52c9969e1a919f8217869cf89e0aead418418fcd340a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a1f4061ce55e100d4f371fd5f05af79a49f8304d62cc0b453a9eaef8544b486a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bb86a30ffff2584a88127576f880535f1c737b3bae49bd9936e22103ebb6d73b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 
512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "080d3cfe03a03b92fe63e12fc3d964afc654c7ddd7e1c401afec5ce0093c6240"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2cbee519f50b74a1ea84db60a27d631aa55a9b00c596b12342abc9e9f28cb6ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f39550b6f6c5649d966a30811c1b7a7cb6bcd00af008dd379dc5fefeb90a8091"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "b545d68944c36e85eda010f1a89e7036af48fab970b8b389ffd51b9b07eb4243"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5886b7099e1fa30d4ff0cdbe7005a0bbe7af74471ebc1488f1405de15dd6ab5c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "70959365e26869fe2ecc309e82ac795068dbf7615b19502bc9a65a45dee7e97b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a80d2d79d7b8d5d73f4d5bb5bec94e337f357b93ed023e5a748c10a15c422d87"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, 
false, "937867d9ac647a6b2173d244f7a69fffa0a5d80e948594c84f1d7db55b783087"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "82b161eaf8fd231b5efe7c1d6a3098ed46fb38dae9845acca91c4c540f61764e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "a86785e5d18d1579b26b16bf37a0a5d6e2d0ecd578648695bce901bec60de537"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9a68157130597e8be412da4049787d65036b1764bb95c5703bc6a4c5249f36f4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "732f0cb960c638518b20501d692ad1ddb814991542badf1f4744f68f6c499bdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3f80e8f27da831155c0e0db7e0519936c50900fa8c127df2d1db178dab5e4a7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "cffa26f076481f69f5d2b9bcd926e421469a88659f278156ddc541a48b4a084a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"84305256c472d837a0ef1982037fed9930e10308de5f7ba9c1c50e9cd887c161"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "92555914dfea05620ef696ac0dde3052e83a53b91c7b073406b437ad0571906d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5641bb8ecef85d82688a3870cee4a375ba8d69d9ca76a8948f3ebce553c6f25f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "19490f9faf2e31191a6c42b430c61cc0af0be4e4e5015b5882e7835b76a9b28d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "752f4ffd3ec853bc4bcf2274fedef866857d4a46ebddea4fc0ff41e65775d910"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "86942ac7ab2f95041b455babb63d9f0fdd1dba953e2225d39050a496252f00a8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0ea4fde308be11438e1751502440263dce9848ca2b2a3d9c2a654157e2d6cb7c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, 
false, false, false, "5a0a2001a735f5a8b442a8d5e3827b623e47e551ec9b895566cc1414d1e73ecd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "0915ab5891d5e2540f219972a0fa097d2a455f69b391cac344eb0ffc8cf11cbc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "83f8645eb339144ca5046e6ab0b8c9ab6f03eca54eeec1ce0267903dc37e760d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "0541735746ed85b1ad482395d9f173e8230c0af56927653608d649ba2fac9ad4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9e602b2f9e207a2a909208ffc14218c13be5f37b4f09d922e47f811ce99e0c6b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "593bcab5b12a9c23ccd44d54e996265e09302c461bbadd6ed1f0d1c0a759039f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8f5edd5109786d3ab57cb3c54d237248eaf9685112e722993cfcf2d16142be29"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "7fce2710a27d282a9bb909a38ebb853ee68387955e3644759b6428c06485c2e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0357e924843298fec0f69f0f4328d328db40b526f8612dad71a3b041f15d7c23"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9c5b51153334d796e35a3135a2c7503c69c5647d574d1ebdf5bc64e729989367"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7c3de1d7b852461a20a442ebdc9e4d5dd2879df6b7e0d39a5d47b34ac379e17e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "c39a65b8d01e4ce2c2d9ad2fdc366ddb3f3b2826d6c280e98b32c479dbe63912"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e116467ea71cde0efdbd2f52f09182b936e441348c1e54dc32e09bfc5359a58f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "99e68b683c152421c11b734d5d1f5574b3f27fa413a991be5fe9afe2644d59c4"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a06a1aff3e910a8307170792e0eb85fd587438337ae9a6570e25c2a9baa3b3c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "780876eb8d1002350f1762b1df257aad4420d9f330bcb11068fae0c14493fcb8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "8a2dfcfe4c128f0c108dec06e31eea0dbfb21774294ce916b7e87125b671335f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "34b499f713725c8c806bc02902c76b0a1b82c477858ace86e04d1dfd24a88452"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "63060f52b08b76719a5604389d9bbf802c787b02b71f308c4a303554491ce9fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "49b86865268663c394b564308d201f5745db48ad52d3a05ffde724a9fd94daae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "69c7fc6b368a0de46185bd8013c607919e423f5f7533a2165a15ea3e2447db66"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "da2560583c8b6e8d3c02ed057a79144bb5b726a3a3e7e0bd3772afe1b5189e32"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "91af52246a6e2d06c18f3daabbf07abb1869e7df3e9665d4c723f2f278f1d000"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "06d65247ca47a979899ca7e61263945fe461ff7a22d90939a5d78a6162c32d6f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 0, 2, 8, 0, 1, true, false, false, false, false, "a399f0ba49596576c7f5e6f1a8cd71db47838382e711f2e54401477ec4797f4d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7a410f964dee16e36c88994e58e43d1ccfa9da86a8d5588395807c622ee73e75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "75f9424b7faf23f21e7c07e640091de4409d86440f43ac3a232eb11a9934c17f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ccdb1f66bf83a3f8b4b4d31abaf3df89fc1c487819ebba399c80e4f0c8450cc9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "48c49beba22967c22d31f39457d14cabd7afa9a06e692e54391d1560908cfca2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1138f0912025edb3cc72dd4a5147ef69ef0a1f8b0a5f9ceb742fc7b9350cf78b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6fedff4b6fb90dde62a70e0dc015a114c718ca4a7f7f83323ff1b034e6014933"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8b9dad395426adec314297e62b80ca6da147843819dc0b0a1fd7338415035282"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ef8c2791ec1e0c8190e2e4fea1bb5411079771f7334257aa6b461845d99cbe12"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "25824da02c03bd5c034b985676ad8cda18e971c3dccfa1a631d3abe5064a1d7c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d2237790d2d25ad1ca59a28214297fa87ec58a69d651604d4b45423baf6eeebe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"8d91c2f2f23df4836e51af0262902bdd041c218fc855bd5245011a90f28b6046"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f0f37ac350be776464a7de78719d58f4c71f08c90f25b7686136d487b4e6065b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "a32c8e485a8d961ce1b498fc65a1c607c7cb30f2b7d577a4847f5e6fed02f0b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "3f6e5f3d8742511654b0a56bcf4c3bcdd48b3c03d1db7f308f3418979a006676"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "9efe71c03a29c2fa0d16aaafa02f891227bdfa54794dd3dd67e8a922b554b10b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "297acdbd3a484f7e5798cea7db89beb7fc8a43e825c9e83ef7cef1a4ce1044aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "d44d33b23b16df41b07f6fdfae878c5fb862ca49ab285d0f052526728a2bfe25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "9326295df05d8f88dcc8e424108d5884574b71dbd938526064e06e54d4b1b2f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "963ab5f081ef614ac94ad54fce270decc580556f6b4337672908475ebfa6ab62"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eec596333e2661f9ad4a828f03036a08742342e13b49df4d4f2fc44b970e50ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 
2, 16, 0, 3, true, false, false, false, false, "557b1af8e82ed58a51e6857cb518cd6f316b6b724693d5730ffef44227b2d041"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "bfed4048068413ff095773bafbf7b6c58291e44c9e728266354e2b8d07ee54a1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "e7600e8636f4eb3c9cef4c5ac3a79ab1abe3b77109a52650a4eed76ec9426617"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6a40cd25fe4b07a086cc894902eb6b1b3d44716e4e7a6e60067e1e7362e6b15f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a06704c84a0b5f3ea052030f25e2831dd5c631afda2cacef45b79ae248ce9720"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "18dea15ac373faae76482c8e73b131b1fe697494799b259cb78e189046d9aac4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "00e86b60b597c6ca62da2aee11eb7d87059526b9201acb34c573f1fab9ac4982"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "4b02c9c891e37f17bb548a7e1e112f53c39c5bdee14b023ede98149619526073"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b5d308d3f720d79a37d78591a61e2f2884ec026cc2f9961f6d11022a6b4805a3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "88bec8fbc5f5cc7d752cd6d700231d043f4ee4f195112417f31a3f1adc157f4b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, 
true, false, false, false, false, "b10f49132e2e10e068dbf7e2de0bd0723b17b06efaf6e48d1624a0452922e8e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3e0dbe6a921d7120fb9cfe042732115465751bfa80a509c6c06c0e51813cb92b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e3e683b96c32f5487d83a0632e4b902dfb9fa5b075b296171d5784c7bdccac64"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "1677ca11ced9319608dba8a56516ad28008f8e5fde0f3e54a60b72f40ce2363d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "512ac4fc857d6c44a1f5a322955a3d8f9494f30dc98109c66f9b95e95e2f6fdb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9debe48cbdcfc3058c35afd048a7dd7ae5e00830607a91b3767312b512953c50"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9d55f7d3a215d40ae6ab8d82f51cfeffe38d89a0ab5ebad2491134a844d093e2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9810942e45cf43fa13050f0feca3684cb06554ca23774a7da209e7f5a22f6339"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c6a48cdfa5648e1237928e75ab8a4424e11dab9c621cbbe8284bfe592f075e07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "66bb842b47e86adbe8e56d5b883bbe8bd4435c82fba0ec002fb876c58a92c321"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "e3841163ebcd352a34c927d9f9b9f75daba785bab2c086ced0a1d17d44a00583"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "0540035296582c9b7c2a275cde4288e245d7712cf782e9053c653e9d79d48535"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "8559442e50e7e5958a29da3589088418afcc24de5d94280e466e41a63dd4d037"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 
0, 3, 64, 0, 0, true, false, false, false, false, "e230ce47d1030e5bd85114ec77decb2e6c768a0f84c12ce237c3ae2463283cd6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "1172dd4de81a00bc9ffb08effe4df6d2d1c75cee6de1ca47b615275a01fd373d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "22a1ca4f2a26bfebd1c108ffa67b93ee13e18b8fdb48d1920ce0b6fed67a9a19"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bdc16f09dd632e128a16bf54ded41a4c80281825ab6fa0bde054865b53008a9a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "20320a4f2553399c08b7faa84c5543e82adf67fcd92c841abdd04d4608ede709"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1e16c87e534268d9e5f1bb564a69974fde273f6287ab8793dcfd8143acd1ffbf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dc1183a6f569422d56708dbd43e79e3bb80bd38958eb355d93c26c813839f113"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "13e559a26b82fd07e4953449f22e5294545eaf51d6a856f245e6ead971e92a99"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0fec697d5059287f76d2d8bbda2f206aed9cef79d6db9406af4492b62225d2f4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "91147765476325e4eae0f53e6eb868298bad30fedfc339f08f8cbda5408b2818"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "2266bbf85070e1c3dc29dcd1e98363d572281475476349cc7500b4b93eb6916e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7456472070be3089e264b52a9c544decb50d4c2f345700326402e90e9e353590"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e50b42ea93749f62b155e995d46a588228d3b8dc2f17b7ee888e2f7eb44502ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, 
false, false, false, false, "acef6a36517173528d818ab57da2e58032e2254d6eb76e6be4a8b3229d6ee6e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3d3b731a6f663145f909537037242ce4eaf273ab06abb584d90a4358e3cfdf1f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "2efb659b9cb070c20994c7646f398edc105c82a3fb101d15e034afc37a007a14"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7d18377325b53cde6a1e93a9ea2d798471c5ea2c6a93e2c68900737719543310"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "dfc9c2a62befe44a47b49772e450d5daddfe17f5a6510aedfcdc018d953fe0d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cbc676ea993254a5cba72b919408c6d70bb3558935b3b65aafe50abcc4af34b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "21a7eed1c02941f7a6a2c3ebcc8541b9a2e8e147a6ded6c6c4dc1fc8c3acf841"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf292d91f4c27e6e4b16915b3a41aed31fa36f201994b3083b10b1fdb3bbf4bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "68a1a7c8706fd1b4e8c776c112f66f68c3403b4574b0a5f30ecc35f06f87a6f4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d721980858483312f7457c7d5d0b89310e40e4033574174bf57b9c5fe2a80aee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "1f93026d55a93d9dfcac0903aa06ca52b7ab060653f943e7cabedaf3d59859f0"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f186457b1087f3c6478b62c23faaf758469a79169d722e057f2f8310991a5d2b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "f19e934df9e08e38131e1eff207c56d52cdb25f9ff7c54ca011cb9df78a614fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "c2333e0a23f20330ddc76cc65f4dbaa62eb7df95f98644a2be8cf9d0871d0ca2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "3e897c4d3c527e4f356d764a507187638b39fa7a3cc05ab61105b9d0a23d5cf2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "5eb620c8117694bb8e4f559f2a136fd7f9a3653fd71936e7505856b2441a7644"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "279458681b484e9452a3dbad65ba85c1923f9640118a5f09b074b3a1f6ec9585"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "941e0bb333715fbe78bc24438398042ecfb454e703293eacf76dd92a2ddf7bdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "1dd01bddd8749bd398d41410b59c7688dba2e7bf3a648ccdf314e95eea8df807"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "e38d689b83b2b451e36923a266cc2ccea531f321df6ca031bec653d0789a1110"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 
211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "1408d011a64a48135eb856b38a27dc725cc1476860fa207a6576cf58766a3b55"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "262920c1d58d526267c599565358d6f102a3526df98d25a97980a04c9709a18e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "abdc99f71e9432bb9b01ed39f22934957177b3e2618f89853a3370da82b9cf4e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, 
"4e36ae394756900b305d07dcbb58f86a0b487521774859aad7ab9cad528e693a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "317fdb661fa1728159002123c5cc623c46123180f38c4f7a368aeed12416447d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ca52b0d139390b1a9948a205af0445ae2d6e69b51dc1fb9a6d69bf6c12e4859a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "e19a109fbabb8a25c31f1251bd0ffe07938ca41a38f4ea55f25a208f631e762c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "3289fb8d2e26fe322c9b49b7e9735b9a39b46f94c5fd649adfa4c7f6985a2e4f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d94e5fcf4493bc597a5e58daed92790c033dfdf684c103dd7a94ce1eda2db460"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f1e515477c36d2c3da5ee9d00bc3d594cea863ec6bd968f53e377ae649509a13"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "933b69050111330df003bf2a558c18fb8af9359075ce8146b15a4a4dec0b0f2f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "21488fef530a8ac76c2e4ed388c1521f2e054ac67ae7bc97aca65bb5c8f06ebc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "30f8b4a3b30cb87976b6b72ab0b27a731cb5a876e936f994c70868159c56d490"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "561d6b262161119e09994299576e279ff62020d58966de71c47e347fb83dc9b1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "feb76cfc796c42c1a8c2973938d8297f04b84bd157152e9f6b8b4f2c6379ff39"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "37def22bb903e4386d695c0b086ed7b429110e51ca12215385ecdb0286e5181f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 
384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "cb408f7a2983ddc661e8a2084541336ddd380833c9787562260019e74ab3577c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "745126c09784fa591fb9df72699220eb9456d14f6d01e8d26a16e8d5190e482a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "17668b088d73a6664cd2ab4ae3cda9c3f25af6e5a88125d3cb8dea46a6600f48"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "34eeded300d30df0efe46e08d15f3f74d49f78e4e66b6fa68787df8d780a1323"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "31be48c882691c51d5f030d7cd29978b8046370689b981d1f0f99899ff0eb6d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "36c67af5cefa42c72f0fecff1da98eb8cf6eff5fe93a93ef299e570e75d79029"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e6cfd8e17965be68ad33e9706f7d47b247f35c29498a3d259aa153bf8a2b0448"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c4c765547e0b1e46ea7b6119c721fdd1cfec25fc3dddc382e480f9cebfb20787"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3e251ce23fab8cf5cd029a84d34e9d071f3900cfa0930b04df36d1a4e12b6464"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e351a433bf79cae60501fb821611a6bb319fe3ba3b28dbf4a3454dbf0be660cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 
3, 64, 0, 3, true, false, true, false, false, "d3589af3d86ec7f947410d85312b75d6ade5382268b93fb1897f87d29e643eb3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5fb183a414b9fa7d8f327942f635b8d356d851077afef60859bec3d5bc00c174"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "d6838cadb62b03e4a17a5c38073373cfccb5593cb7659b95a801129d97fb73ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f2433d73b9837dd0bef75849bd125e5d15b3327a75657516d3358290eb08bc39"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "6de84e832f95109e6c00c76a9ea8b000f63b9ddfcb42af5710070f683bfbd21c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "82c5497232a8af4fd9c7e04f0e22f22390f2f495eb946ba483b0584c17063dd9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "94bb2f0499e8ec623a011516836b2ad05386222e19c5346f49437630c1181c87"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cd3f9a1ce97a170fbacf69e85f929d25c6d64f65006f95d39f4e5371c6ce2fbb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e4c7ef956a295f4b32fc9b216c8ff3f07f25896aed82691db80e9ca2b219f290"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5a112c9c471817ee9b9fedc879cdebee31297d23bfc9f26d0f1e1e7a3f25cd25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "40e84f0dd6c98ab8c3d188b39dabacfcee406512762cbcd22e2c4e438cda640e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "02aa4fc070b9b93f9869a4c330f00303dce5cd2bbd7c438a39c123f43a9df30a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8554ab4f7f9a969602bf66afaf6c7e04aae4e314c7f500db52c58132f8fab254"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 
2, 16, 0, 0, true, false, false, false, false, "fe5d194e6ad1d2f20694096e25f6526451d733351bda9ce02bfbef1344b58d58"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "17e9d011adabe58617a7afd5d1663df163fc53267b85d632dea121295967c8a8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "fdd31f0d3c7fcb5348aa68e8a3deb235592f0867051d5c02ce3829c442f8c444"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "4f81f0beb6210c91fefc999451648eb1a4d9f99eccaf3b4a507f2810534c621f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "f5c515238e79e0c835ac60de2cb16cfa15f659cae19c225f8b5868d9ec1ffe38"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e9dc3824f3c9884f8944294039b8da2093b3ec4f35581942673662aea3d14eef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a6c4206bb8326e62fc243dcceb150c4b6fa29440d2550defaf37f25924c4deaa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3957b772ce6274590820ca62bd0bd497f902c1acfe2e09b806a23bf226450fe8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1c0ffee0dc6785941e32b50e192aa34b4715a566c0c5460725f2d1962d52b5b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "477559f69e0a6dfffc50c4d901afec3cfcee7dfb73bef8ac0398280b06c6297b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, 
"bf3ce2b126ff907344f9e97e86ec151b6335a9396131229c761b3e8a5b68164e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "fffab363cddcbed8437b7a257baff5a65878d81985da484d14df9d05ed9a223a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "c38be994f6ee91986bd709aeaec4351d1472c3526afd040728970f879387a330"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f9f5a0cab56440d962f01a0efa69ed8bc641635a7bd962264a1ddc3ff15ae8e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 
256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ec9ad8aaec88d58e6c65140edf68b864dd1b5950e8440e59c5a9ad9dcc2a8f20"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "b3cefb417e4819122a8271449d322b99c9aeaec488dda43d408f4084da29f33a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "6f98537b47f76e59b7258a7e7adce4ccc5efe3c8352955a26606cab6f24f0d5a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "3ab67486ab0fb0071ce62c58299db7006ee96e298f7dc794f8bae8cb36f8ec3b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "57d86b56d8f58dab2e0de7a6c35fca1456ba979d9e173dcc5481fb155d08302f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "c41ee3b276593928aa0706c8d0a628b950e147bb6a766258965e725dc429041d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 
2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "307c13e92795db0ab99ce58c97e2f9155e0a34f9095c49cfef9ea78b5e9f6395"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ba08f284962045d7e79317a1b64d767e6209465e185d81d07e5711dfd17c6b68"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8308c877d7b4cc792acd6bebe9b6a888763b2b1c4bcd27c37d7749e8298c58be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "56c38feb9bbd5b858f5b6d617cae2a335cb70c7b54bbb29e06ec07bba3f121fd"}, +{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "960d73cfa7bf5238de43879b5378a138c0e43a2bb29e350d75e361aad29fafa2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e11a564528abf5e96eee0e0a3ac5d4847dc9a2f30521648dd74a56fdfb4cddc8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "23b43b1f4dc5ef00e70f3360d8bdbe4fd8f9c294592494e8e2748dcd849582df"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d0dd062cbcfbadd43e1771952af96df1a51c69a2d901ad5c74dfb56101dd2fe5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e801eb60c9ea39b3b997207e90d292ea18218dbacc516d201e0ae0f040a943b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cbaee12388b677b01ce3bf00f64baf24dfe3026bde5b60b2f39a583c75fecee7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "30c58f1b9a1ba77e7776add45b4d0ebd14e77f7017d2e8d07510aa2c62ea670e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2f99dab4e7a84648871ea1952f22ceefcb0034e444063bbe1a848589a109099d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "56bade072f0edb6e26917452d6af295ad1fae0064e9e586f34a180e1e2c8ed6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "dfb916b47a2b786a0e1ef1de6fcfe6d551d409176200cd8ce2390f12548a4291"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a81a44055380ae0cf6cfc37f478a3554dec540da4601b3338d8e81403bfde90f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "8b6e5a9428a10c7a2fd0ace718fb162d021babf1e21ceb026056e0d8261a83ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "adadc77217f873773d4b3adfb756e9f5774fa3061be40ef562ef41dd1f8e6028"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "37f0f7db589436f5de179c72554888feacbd732f4892b10dc9759399d76e2cbd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "adcfc5ad595170037dd82856c17bad3bcb0926f1b8294a593c4d6111d5bbaa99"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "58beb4a7010ed0b3c5c16c50711178f19a57f8c26cff48504c1489b7e92f0f94"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "5a0bb2fa8434ecec07dd2cd33fae913beca284843c1ea6240a9c3456d24f61f7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "708998e5c5722a9aa5b78fa0eef90a028a04e0316111da3c5ccbfc86c829bc49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b63d571aa408acf4888e170bd526f5d50d2b329293148c54f6b4475119819aa6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "05908eecdf8bc4abf056cd397033a73dff4fc7ea3938c28672811763c706f868"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0ffc84e14c430f6c89f53fc23a7ee7cde5def939d8b768d932314d079994c382"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "d33b775c4e35ff3fd82702a2ba35197ff7cb1ac6d71386fbfea130fe328a58bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 
64, 0, 3, 64, 0, 2, true, false, false, false, false, "be7cdc203f150efd4e77c64f761bcb074695de599f6b10e454719aaa65ee7f83"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e3a55312b9353bc71bf732e29ad5087d8ed8975cf369551d3c70bc57da6cca4f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3369a61f431a5ac7edf17b40b9a63622d2946eb06ed12d6d443a9d3e31cb0ec2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5d8bd5fedef7ae87dda33d570e5cbb3d59e2bb11e84de88f2a941c9e3e9ae933"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 
64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "186c14349c8a8efa63c591db70ec522ce66b2f30fa45781e37d14b5dbf535d47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "92b56e582f31ad770e010725eb161b19486d1c7165d92414c45b985b9ca96a69"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c09a116fe80ef73fd0ad6cb412e661c3e39196625e5a32cae22b036496391441"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "607774edb90d222771a19714b3b8d101007b8aee00f355ee6820c120fa569a26"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37805dd39b1df9bff073124f416d518f0df57bcedfd04aa701192097a50f5555"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "8293d1fe39ac262cc8e34e99df1f7d468478027d859f5f691b863460edeeb2d3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 
0, 3, 64, 0, 0, true, false, false, false, false, "89fbca08dbe820d5acdecca18fdb52143e9d0c7e3478fc037e1f800557a92605"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a52c689ed0458be30ba4a68d4e02c9481689cc0208b707d548a15372c1a3e4f3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "419e1b13c1f08ea986fdc546f83031149e2a50a373793638bd490bb1908a3609"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "59b362638bbc90185967f7d6537f23fb4d257813afaf20d90e855e7da1f7559d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e8f257cbd7863ec03d3f9226ce12d4d01f834a1662dbc954a8c22db837a98232"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "cfe5019213317d53ec42dc7cc03f2d93e72656e9141077147d4469472d3a221b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "19af44abffef5ec6b4839dae1684d1c5c5742bcba5e3d62a9c4e0a2c7ad00723"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "2d0d640ba1d0e38671887bfdefa1ddf264f6880008196433c96d42d44efe5812"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "d7574783449e867226a9442023a8da03c248256b400044d6103b7e494cc4dc43"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "63012e9b92fb99db14476e7382b0f6cdee3a484e11c709c8d62422ac3f382e32"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "b28ef8c988f8c0af3772d9ef94a1fa7861daab353ee064aa971ed49f8ca4fa35"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "0db7a0b31405f62bce91245b2de8559d287ad363db8c2ee343fbfa7a91ae1683"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "45e3f62ebd99bf5747d79ba8d30e88e84f7f057234f765844842a15f0b2900d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "0b1d7f9a85d682a7299f688c7a0f21d3b29fdd32b5f7e81301d8bcf806282c04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "679e0341d1d9519fa9f97819382d3f422cd8edea2969080989684f2c4bcf4bcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "791ec246f4c9f2cb96083ef1c0e3996d940131a7a8850eca69d4b68238b85ed2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "a69482916968d71e53c18ab8ea7b466a4a2be2600796e1d7278c818ca3069342"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "a9f66435fd10257b2662bbafee3cbe6b3ff5ac57ffafe5540a37d075ee961cf5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "6c5b6025025c058a5f7ba85d68fde847f3c7c0aa74cd1088e4eb4f085390b7af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "45088f165dd7da6dbb8754163ac8a39b530f80a4978f57afd956a9ddc3389c99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "6a5a2c994970f543f507a57cb8d5e18538bda75aac0f314e3ea916e17c6de8cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6d2ac4e9ceff4efa9e3dfb2d08eb09b71fdbda3fefb7943a9b6120a551fa0671"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6da819a821010bab54b74d2c9fa0c30a34c4c78e5d2b1085c86a085ad3da98c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ef0cf0b6ded391bee27dc8b67f8b232d468355f07248d7e744fe0033772ef701"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "ed4c967501fbc3f703b68f41ac3789f7eb074fea500b2822ecae9b2991bfb75a"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "92a2e13b346bf3d169a63b6a44b171d66f8ee833b34d3d1fb02acdcb7f5e8217"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "eb23cd13c3d9f2dee160a9869ebb54f305e31ae9a6fa769a6d465ede6b11a242"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b8b7442cf9ef49b04bceadf04d8bd313aa0f4bc3b9890570a763554688c50576"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, 
"9c1b02a0c01d20f3e86ad7e2964a79f3c355f3ce9f7b28f68b9b2e1f76539090"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "94aab56fbb3b6fa6d44badc8588c12b490b17fc92a2dd82ff8282959026ea98f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "96207312f06d5274bc13016912cd72049609fced10749c56e0acd6562e78fc42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b8e7091bd594c9a8d6b70f1c8e26f533865d8421819a6fd6e672548c65f600fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8e21dd8016c054d252151fb1001dd49745e79691df818ad0eda92bfacb4fc9d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a93005cb923bb4e04521a27ea8d40205c6e5a11c8c3ae4954d9173059e57e10a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e85abc2c1f8f0cf1a1b8a089609a7b8d5788f3d380c386e38e55f808534e6508"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c94c3acb33e329c726ed4224b64a334654f8076f7cacd3f880472c2576b960b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "dafbafd69aaa26f80e5696a86d7776f3aee5eb8d9b5fe2f64109481a1d332e49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "10c22be7cb437d035f005c7ee2b72e7785f4b2bb151dffe28f8460a293e566cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "8049e35c2aada9352f51c95dd1edfee247867ff44718f4ce5e51f4616ca8ed55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2972c3809fbf5bea52ec38650ad5e2f48b96ebe824334ad0c09df2c504a614fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c380380451916e084d3baeed835e7f631f46d486a393a02b9a2b4c9181f859b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3891cb46d428858d054c26ffd7d5bd2cc5e63736fea352a9114850f228a734cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "70718c527dc31b0b9f678b9e9eb90bf383a3118a5a84067eb7ec2e326417c247"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "ed83b334f70aebd5ad35ee0c5ab221d1bfc2da6cba36188271ae55ce5d054cc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a4ca60c62558a8959874324f8023ac3fdab8a5be490c3aa29ce0be450c7624f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "60842f964782723411b2eca3b2c26ca7a04c56c4ec11621710bae3ce7233ef61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d9b806200ab4d9078ce6f123c8084da1fca2ae3a8b98193e01c1c0ff5c55c896"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "483248c4f96c6fcda383790ab400a7bc37f1773003bcb1f452ce1d96738ce5d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7060b16b7c8555ee33c995ef62c5470ee38283ac58f9fbd3a2b314b294d4cf4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "30b5aaaffd166e8d7d59ebf482f05d28ddf82d296fde43c67863b1f04277d604"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f485db72dd7d2bbf272a933bc4b52faaac2cb53c1fa9940f4de51423fa063b76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9814d44f467906899203a0aa053815e2b8070d803fc9fe49e59e24784db187c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "3bfdfe5d604ed754c7680a24fb7c7f9db732704d3a1ce5669c0d710c2df678c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ebfa92252f7e83b6abcf2121eeb4f4f3eb335d046eb23416ab63cdf6478cbddf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "033ac854184436eaad6641c88e8cffc3e04ac011fb97027e0496296cd8d88b95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "c164014691b39a4783a1cccee3a1fa89b993746f96f7fde72bdf354c5ff695ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 
512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e608c5716ccf989b11ec3b89f7025aee361eb7de9322b2b99c81631418631dbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "469b8150da4225d99cce2dbc6cff5a9e91818036a1d1a7d5825df22b45abc61a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4d554cc886a0de1f5f032b5f4caf0b576a7db082f1c2c13cd10ed9935a1a6e6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8c2703534eee23479ae20a9ff28a98e82c0187a357f47fabfc5b6f15c53811d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "84af9fdf3ad585b1293b5b9a74b7014145163614770b3fe82f460b13ccce7798"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "663710cfa6c4daff8411f98fbf9efeee57b7a6a50bdac5997358f3b25e29a88b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "889076978f0d7958d253085f5ac24b4727872123cd12ed1541e0a2e8a25bf597"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "35973ec2dcd95a169ae83577d1e522af05b227681def37bc17b2e7d7a9539a8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "740e9b6b04de19ca34ca200fd2a109f1615b5f5e13192023cc5c539ad5487ba6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5a431138fd84df76bccf737f02ab4a159e17dbfbfbc8dfc552a6878de3d84060"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "1037e1a1d8514c0e274e40643fdb7311cb12f9652099bde3817cb10f029737d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"9786c6e6b5b3f6c28bc07de4b8f465ff7c59ba967b2228e1460414d51a28b2e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bc76aa97480fd1a6892fd63c68918397bb4e4f596cfdc1f58c731abd118df2ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "4a9a39526eb8685ede202c8a9ceb523ec88b22847ae1c94fbe6b004dfb72c168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "bd1fd684f83d340fefa2b748a09b8e3459553a930a3c02e946f353d279453253"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "362c8e568c39c13330d7197c657f052e8dc32de8f748213e4632468e950171ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "0ee31e56d23a199beed1bc19b8c5db7d4278ac80d0f065fffa15d4b7129bd815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f99ade13245bb535a74ced8a76126fc86be07661fd5a5380a68d481f0653d22a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 
0, 0, true, false, false, false, false, "7f3b15406f4a82b601b1a45ebfb5dc509e0a158300a2a94437e8062e7d5162a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "09abf7e2a0800d4e7c13eb39c5cc43c60e5b5fdb719bce95b66b3322edb7695d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "50a0e1f5e798f12d29271bfb23fecfb26ca73e677936896fc313f622f98fac71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6c30fbf62434028e01d1cd2065d6dfdb605ca092f5c66e313a5f1761a869f76f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "51a7e1afc413823659abceeddcd0cd1ee0b59f1e4de76b0ed36942b60a078db7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "4be81fa7d9bc9001764f0e5843dce222c9ed015c3d17882a6c0ddd26964c447c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6487e9971b95cc6971051f5a38ead46bc6964fd308054ea728a3292165aa0adb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "5222dadeb1dd875d36096b430ad7e9faffbc27d7438d087847764a6b1d3e488c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bc518983d66c40d09555138c2d7ab94e4e776d955a9351b327fb34b74736a512"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "3381f262ad759a8f4bb2f6aa4266ad0fbe79728d29b7ae5687d794e647955860"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"b80f67c6e2619a492324da3a67a43832d3208bc57625900e6c19868b631bf194"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "056dcb2edf3bc957f3d60931b6d4c8128b1ea8dc5fe031ac7bda5444f8895d55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "379e88bab26b9b92344849226eed6cec60c4dcfa7d5ca4aa779987076d1abb02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "f260fe162b27f9a4089d56e8ad7d1457c71b9777624b02ac98451e7c519fd36e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "dfcb9311ef52819a03f376f42b3b26139a5c2e4ac100de1fdde0622737b96675"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "39200b6e5b7d421af1c9ddbb37b133bc3871f1f5107ebb2b2ebcd9699b98fec3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "28c14b3d0a1866fe1e3552486686e8c95e8d0db9c5e5439466befaa25205caf9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "14b4cf09cfdb49af24fe249b6c95120b763a4350c94542bb4584c9ff54bd27c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a713b5b9662b1bf9bc243a505458fc9aea191629282046ce27d2f06c97ea6a98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "252bb7fae1f837f6b14b6369446a69edd38ba32ee0904a49c17615517f7cb575"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c5f6306d9947b892e03760f39df2e7ca970bd1661185c3745b8c7337125a8003"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"70d70dd1638bb7f130aab07a8e526d8138e67ac05e42ddb27b91e888c767e68f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "818ab29e9823dfbd646cfe48753655aaa3e3a22fbb9a091d5e76893623c62422"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "cf2e84200550294fa243b2508b0874b0f99b44dc5144f1c6e918ea9b69d2af51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "624f7d1a143a180dcb8d60f1364900ab3289d8e407608c7eb43167acebbc3c16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a017b86c55b023d8cd314c9a3e83b8d15a39c2c44c6b920d88f4d642a5545306"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7b807d87445d37280b639249d41051f06e9568a3b936e44c67dabcee83c737bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "0dd1f5e6154589e91afc395b8b70e47025055a347caebfc6680beb4a8d214256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b62ccbbdab6138e2441706ac47c870645ab360127a9bb0e5326c5b810920b3c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "c6fc8ed0e77caaad1775c038401b8633eb63f549e3d749a11c4b84f5dd804b41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8e5bb822c6f8ea5b93296927b6e60fbfcfc9decefca9ffd3dc6ddc50cf2745e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b3678b75a8fa9adf02e077ddcae0e9f1495996b1ac4cdcb0d40da12c5f4e7d35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2680fcd02b7d098fb0a69fd2eaa8d9663022955fb5bea32c05052fb4f552707d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "208feba37e5f6d3ec7e7c037d4bff89cc4590366b3e05b293421c408348c3b81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "9aa55af6dd8f6d36efbdf134b13ac498127b77b2db7c716dc6ccc0aa7ecd363c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "beaf19bb61e9e4eb080f480a7b35e2f08e1bad539e96f829419573b6f1f35498"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 
2, 8, 0, 3, true, false, false, false, false, "0bf79a069fc5ff89b190a71ee9989a42ebe20bda9e1f323efc1a09dccb1df190"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1c16324f20ceba7f1449c590190abe4f52ec73b855e4fc990311b277fe1b04b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7f6ea237c699d0868444b8359e5f5ea82a85cd4fda8d3cadbf19460aba424bec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "74601b0e9125ac5fb6b9320e2750f3787b4969cc311165227ab37d3967218aef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "71ec7d80f91ebf19784d862014659386bdd8c1786b6869e3f54b30ba6feee19f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a66cf9d61bc2181027ef77a71bcc63b20576547164d1c5085b366395cf132758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "24ad1ec05ccefb32da8d6c3b3503ae97f24cd55a98c2b652335de78ed3270b07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "cdda0b1f57aba8b4428a405a3ffea99f2f031154733466e15c059507badeb49a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9371d30c4998aa2101b8527a7c2c37e1588203c492b490beafaec012a93bd1e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "cabecd528ce6b64b506b995dca803aeecb1492144087a79533e752ff25256f9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"997e09b74d03eb69a0431f97c933ce3b90d93177879645eec916b042d2343b17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3b4e6ebe1a274258b98ab625410a15e00c53f87e68106303a3257f3fccd834f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "84852e0b41e100019cfbbdee40cdc98896c023a1bd3a85638b91162cc9436821"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bf32c1da2a613a2085d6b2eb7ac1437d23c19751f2ff0813ec8bd684f106ae0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e3d49271984ad58f7eb665bc4fd7d263e76403924bd8b935476939cdffabae97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a6d400357385c9ae930774e83960ab472d9aab61eb62463762b117e2ee4b734d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b321104345580e2f68c9d89c5f287b62587f63fffd23b83bdfcd3b41b8337b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "55e7b9e270d1181168c681270551489888aa3d516e2be0b99b080a809f392ab3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "399771766a40b7c56358904373140c553e9181da8dfabd10bbe37dca847eb73f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "91f0d8e445c27084abace0499e692170d88100979b48b6c85b93852f669d0ef7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, 
false, false, "727ea61ac6f2f2b89063143c395c40d38b613e59751f36f4f490b51aa44fe0ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1bcd5dc1b7508b605645fe06b0e47b668787f776711afe183a3519e47c244706"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e297d2096147cddec94b8487a07c1ced713306078a5bb0733f1385e44e556e56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "a3a80ddf42bec666b65093914a5d68cba4ccb769f0700ca92b1ede80b143ed8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "5511ed4190c3a3c0e8785f3689775af9e6ad244ac42135aba3121fcff6839c13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "3d3843e33e34b61f04f03e1ee001bc38976d016cc39125434d0f897428d1f6cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e2d5bcdbddf0cc8f01e5c13694260ba2323bd4b47bf544c60b6ea1e6a8f37f76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "14f6c3b325f2a206b4410e3f012257639085820d1d29af4c13bdf66b384a864c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "80b2e0b989541f3114c83df31f1ef8550b06d7f6745828bae688b0e99be83e00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "f7e08c9287d59a8e5bbad5f928f949f4594918d0031f9424b823b131cf8699b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "7a0a273578ba0bfd838f45227bac8e0b679694bfd18c7bdc1e44c42e5ce7a8b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c4d63ef504f2dbdb52edcf5494ebc9db1b655a6dd5ffedacefe96e1c5fbab9ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "350e50782d8e8814fcb0c491e34cb70ccf2969bda04e3b6d130c0a934d4f10e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e0885f6586216c676c4288e8259dd4a99f2cee588ae5713bc8e6b39884e6fec0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "591d38fcd1b2b634f18542ba65f3ac089c0fab279bef2f20273b71d49ca7258d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9b82838924824b6a6dcb9f4c0187e44e350027d2bdb8c233cb303f5562707544"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "62b8de990d939f7cdbfda500b9d74b2c7b869512ed3eb34ebfa99f24c14a9f3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, 
"f18bc56e2261ceb99e561c5ad432fdd1800381f2627da6ab1afc27de784c5b33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "748f78ad79681c3bc496ec73222302eade6d023427ac7c4d27018f780bc1ed83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a9bbf4e9793064a992ee29ed6f2f886d54913f1f5d01f6c4363fb10c74765b85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "60661fde361b071da0dc05e79f445b65ca2b15b810b4ff55479e318081843c05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2007775ab259cbce784e4fecfb59cf0fb5a8138aa649231e35800a24fbc682c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9b2c87b4d61399996f2a7db58e20b0fb63eba3cc725a3be089007ce4a4530222"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d9992bf4c427aa85228bd266015cd033381d2bced6988ece769acdfb95446520"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "abd03a5fa18941e2896f04d3ee6037144e57ad69bae5171e3202236dc054d79e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5c87bde389ed13a86f36907cc6fecc914c178e38903f3a524ab5ad261beee6f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ad8718637f01e5d23f33c945c7949cdcc6bccb733534e06f06af0ff0795db7be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, 
false, false, false, false, "aacd296d84d85cdafaf577bf6f293b99a405f02893170e921a47f295075dce14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37c337768240b556536f9e97e379f5a36d953950aa14701439aee68998bab1cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "8cb74d4a335625246d28dc34e03bc1ed7631abe6b10b6f20d040075e71cb6af0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf8f8b4f7c77112d863116aa7295c1743267485294d260e445b06f3ca42bbdb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "14d914a396dcaf04c287ca309465f756126025dbd5a039ec622aed97ef90d94e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "47069adf5bfa4a217e6dfa6d02ba15d698ea59ca14a263a2fdbd98302606dfb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b61e531a79642bbebde76d9aa8fa0dad245f9c0dd96b981a7002d9292ebfa172"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6a99285eeb82c9636ee2f67ad310c1c443aeec7b03dd36769e362e3d90eef953"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "f469a9cb02dc11c63cb0f2606393b24222375e1e534b3dbefaf44c55b50ec618"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6d0b982f03c28cb28fc8b6d4b7d203b540af3a4903ca00c2b69f1e92968fbef4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, 
"1b2d130d219be85b72f3dd07d301082ad30262514ea4b651722a1a96ae9aaaa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "a74f99612ad6a70690de284fe168a14c4d69071bace3b032f2078881e9c4c3fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "3e7a2018fb0a4d372f2e35735b593385d7c75e6fefdeb48b264c313e6f250b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "9004538c7940b8b5de7594536d41570967dbfa1e34524267ab08da8d8cc87bae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "180fe05fe1f34502b9ffdbcef54c05030cb24fdbac618cbf07683f49212c83a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "34656e9f5041c64e33682191d34d41f707da12a226ee6969f270c78ed4cced37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "102880e8ffc8dbdf06154c7126a3d62af79e7ab29c9659a3aafd43121c79ddf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c513a84b1fb8980bc52ae449390303fc24faeb3f1227d111af5f8179477cabab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "36a11ef51ca3fa20b985adbc52a8409eb5366354195f57ea614030cfe5c6ec4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "2cbb6fd39bd273b3d59ccdf83aa5e458f3a716c21d542dc2b92c4c598e58b551"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "ecca3ad8cea0e6613ab9e56a71deeacf82fccd1ba71ede944e9c90f89ee30098"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3990283b30a82a472d8f48661beeedb53bce1707dc820e8b4b0d37d99dda285b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "704886e1034acb7a1bf6e6e6f3a1c875a8f0f6771fc81da48cefe9121a6a70bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "6b0f0a475d2c3b1f9c05c36dcaea84894e4017c3cb97f4336045e21b2a6b4f2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "025f2920fc73e0c3d6bc9c18d19135460e2b986018eaf8b2351d315ed986f5b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "db1cc0dfafc81a9cd53e895942b38b6e3530197dba4b655942ed7a53277ff9b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "90720fae845124a30ca5b1da681d17adf0ad79cf7965a77f987c3da7e6e0caa8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bc2fd52a162ab8f050ead4b356220c4f05adf012509ac8f9c50572eba8c29109"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0363e2dd68dede456cb11f5dbc38269c6539d9b5e2682410b89e64a2678b74c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2b478190d2602521df9e18f62ffece5069bb6d511c5b4b4d53cc107a684b4ad9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, 
false, false, false, false, "6d2169ec4efa89b61e325e82c7b530720fe156ea182e91c6c23bd42e594d35f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "87b62f27db385564fffbf9b28679773d1b67d4348e951393a2262893e191dc1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "f8ab1272806279f8c3824265c2efa17fe905bc986272a2284deab701e5470a5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "faab8c2366e44835ed9a7e023c758058bb15d091f63cc0455da3fc25d18e6d90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "ccd7064deb3e49717b68c4bf815c4f66aec8c695c2bc26dd44e59f3cdd558c92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "cc7b0a04eaea7966f75ea42e2cd8a1f09ddc674955e5f803efbadd9b3e3576f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0dffce29070e0fd9876cc345674c0da88ad0905c55ca29b07cdad1ec868e9c67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6ec425c5ea7742e298ad0e1abe479e00c5b09ee218a40f15ca0bc46d20ae1949"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "73573bcdad336aa0108f38f64ccca95fb92fa6274b7f4e1ce8077039c9f5715a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ba973227f394ed2599f09f4045b4e4f62bfd1cf7aed146db33afab652553a2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "aae7f567541a83df1afe82078059647131fdcdc36f9119f0ae22c17e50be4a94"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "34a2f1a10a42cf20e4a839c8e76bb1cd02a7a4da237e87f43def098f6160624d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "e2667316a5e9be2aa8e71009a1ea88d71c7dc283f5fe4a42d7e965b6f82c6086"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "3c4fa7755ee44507c24f834739110db30511fae833bbbd2ece41b90239a25611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "24afface0243a7504e51c6b3cba1b161a5c0d1c35509687f45b38991298af292"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "421b1431d59f540cfec8add01108ebf58a5d3e01e72411fe9b29b17729169118"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "fd59c22dafadd192bbe34186dadc76e6ea43b09c7a67bcf924fe62b6928df0fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "cbd34eef43e6e8b0bba9da7941a14526228373962db795a8d0ff7c3000039234"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5a34299db43a0e2b7c038ae7d53dc41ed572b670785b3a920c925a476fb0bb2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "285085b468be94cd0a4023a62739fd332fa91d7f26a540b88ba2f24323fb9502"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "62ce4f09d00471512c35155ff66ee7a3b59362d47f9b8a1d7ebbb7f9ebba86f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5d007cbb0b80970a1a1efc9a2f6373b9f989fe64b9097692252f5f1c0aec0b7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "89d5fad50b916ee742bbfb2c3212cd0cdf382e948a95a91695d9b7b25b5ad2c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "bb94477b563539e1a92478b14d6b0e2b3972282a33994bb7dee9eb84959fb4cc"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "93394338457a8eeb86c2dd8008e53bf3351cfa50dc540d566db81deddf627a2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "256fab4370d4b423cca4721a6b39da585333a7de4e5ee86009be032042aa2ebe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "10dce84e994ce68ba7c3fd21d4c2042ecd95c88288124dacd9cc3ac6d3563f14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "3b21cb9aa7f6523579258ae1b854dcfb2ecd66099807f29aeb1472b64e3b4a0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "58e30ab91f4c898b269500f6996daacd25f6e0481f146116e0419736a5064515"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "0eda34613947016a7b5fb446d0ba256fa094c50e8e3e60d4ef71cfe0cdf4ec9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, 
false, false, false, "d6ba52362a0c78625df5f12062df057df4f93c77bd1dd24c6131e7ccb2c16638"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "fba2dff1f736cecd722386b7a247c45db29cbdf28582dd49573cf35fb3732631"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "bf1d57dd9e37d59c0374edeced66f0bd8b75bb8f92cb60c1d6bbcc55b16eb984"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6294f5e8f0de1451a933aa6bc1693b2bda88b4b1543fc846bb40d3c56ba7079b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "a554ba5b462c12a22f10c927b0ce48db3e9f4ca5dbcb32e53ebcb0c45fd77689"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "e86ec2d9dfdc8f00233bb2ff77280ceaab4e8ffd8a0076202aab5828e66e78b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "ad2d5a93663b185a2644e46021e7f84f8a3912b6eaced6b12b0bff8e64d8652e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "d309328dbf25e481bed1ab1128b45126521fba73cb27319baa3ce5efe309cdd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "bd2219ad8cbe97ff716011fff2dd94b3bec12afd3c8e60e81563189145a2425b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "fb204ccce2d130475c836039b600e6e99eae5605d498fc092898151fe6677224"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, 
"8ec7e8ed8ecfd55ba0421d05a13b5efe75409793d03179ebacce1ed0905108e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "283a1737c510ce6cc29a3b42290930457e624eec770def11b4965297daaf754f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "35fdf8a25c2c860519dbeb1d910902feb07a1fe4a28b39a578fee6c43e393c06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "d2a124ec84914b1234b95b77606ce4f1db55bf1db9a1c86541bcaf1bd30b0d52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "646744aa15216c59d45c87a1806913c262a376f534d0df191b3bfd14bc236c8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "3aabfce7fb76c176a45d06f040155071982dc8cbcfa865b6ff1ad15700f7bb77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "21fde8895def085023e204a3c0c21c0c1c109fa358df90f3c99f2d759a622a9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8473b17d145e9076d949e04e8c93e70c702649df532f524919b7bcf593ad2955"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "742193bf66b96ee2ee4e7ed601ab8b546070203d693a661146dcf1c5a0d59931"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f8cec05977a599181807cb796d1176f602e47474961404a87fb997df9f48121e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"5897cf27e6098c85362891c857fcb71f28a9b4fa98d9f5547b4c0b8ba2e0d8d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "ccaceca6ef23367029d9a5b106e5998abaf358069eeb9a3dbaa0b97825626d92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a51ae85aa938b9b2aa00b230a4a849086b8e03769f6f8890d1802c03ea407503"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "559d99441022998a6bdefa7e4e0142fbf87c5338bb3befef68e656faefc9eab3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "66a098485933b74ac1f2e60ea8423155b5b8a9cff22ab482e67a618aa2764a37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7141421d4186ecd7349e2478f9e0b8f369686705605334ceb8a67c4cff295454"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "eb974e8bbe6fbcc992fbbfb84ba35508b0d6ec81a43b44533ab7e96a0f7088ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "61a151032c5b064a6242674f33689dba5882acda2c8323bb5121b620846b37cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "78002986299cc3aa74c2d984f8b155d5865e6688965b9f62d13ba8f872dba3bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4eaabef3608585c9251e85a52b16560004ecb9b7964dbdc91144fa03eeab5812"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "1637b00195e7c54b88475f3157a20e08857a65176cbf37de963937c9cb8d8d2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "c7adc59e4aaf860c559d369b1416ad133ddc877ef13d9cddda922c2513777a58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6e74c4b18aff9beee6cdabb32ce78843d4fea59aa5e462d562fae71a7bc22026"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "871515ac349145e4d0e777dbb917db678d3fdb537113046a59ca81ea8d541ec6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3e7aa448e43c96002b9d046e4892b5497fef339d4e8013bd30fd42f26e5b179a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1b14a04087ee62738cf875994bdd53656bd7741b77f92e8487f9266b912efc01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "43d698038dc4733e3f37bca3da386341acb54dfced839656d2ed8bb49ad468d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"68a700e8b701eb2b8ad236c10c5edeb36063d2b0d0d4038743a1915c03130dd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "6ff0eec07ba6f936c23598e7da732bbcf5eedc972a2cc0de185b65b6cd331691"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5b170e111893c0b502ae7c30d20026a0f256eee149445528eb0ba5f1b2b2c2dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "dad91915ec68ff36410d51e47e22f581da2686f3993e9531d37190705ae724d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "bf3562169a8c67bcd6c783ba699f0284ea53221816bdf7335c646b1e3bacd04b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7f0919113805626e5c852a03f89707d741263f661d375cb3b86bf06abbfe3af2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0e647fdcaf95dc45fffdd22cb15b57faf6b9db0105abba7c0e30e33bc2e0b6ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ff203cfaf23a323cec265d17a6c78e9461e5a805f550c70c73de2dd2240e1ce9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "698ab5489a4e36d4b15ef240032f0d431b4ac21db5399c76c9f191711ce4094f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "fa838970cb5aa9483a7da685ed099b5eeaaf586d7c5166edbfe8f912cfe05823"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 
16, 0, 0, true, false, false, false, false, "874e1983fcdf2f092ad33cf36ef736dcb25f15d5d997856bc2f049886414b92e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d7e735ce7ff9f479672d7ca7e745906d16ebfe713e0754c3ce531bb2898e2d67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "79ff0f9f2238b4f35c79fde0da2c8ec7d33f50c2d739293a38e81b2a7c8595e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "b993e2ad6740efd6d21bcbaae1aef9480805653d6c93ba87fbb4fc1a9216e68a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "d034f5e143089d616fd5df03190d220614fb756f0845495a53faf430410338b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "75f9ed11769f9b0a5a3b64aa5ee094d0c07a805064d988e8b7f9de393dbe0f5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "0bd0510495cdd7492fa375885719d79f58ea186cc0062420044eefb332ad541d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8684504ca4d620ac2221cadd57ccc9eb6af88128c3292f476b472b31b4556ed4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "991e1d8af110bbc2ffef4f4b7958133c981ea54c10d03ab7c0a60dcecc51654c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "c15e7bb4c24b29842accd04f1c7cb6c4e8d30a0613cd2223c3ac1a6218114bbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, 
"aa647d0ea9d0018b536f7585b7d4392ed81d0db39cbc8520c25e39e9c05ad06e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "bc57bbf70fbc00e87e60b601fbc635615d3246800fee0b9d2a21a84a25171424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "cb243ec7a56b5d633092b18154bf05d69d6d21fbbe50c7a70388ddce7b2815ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "a2acc15ea3ac8c3596216be7ab6cb722af6abf8ee61e3de1d9ef82b0b7391333"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "ba4da4192e637f13a6f45a1a9470d9b8667eb2f3c38849b1ed4fe5240437d928"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "e811972843c62389912f52f58e9fe5659f6770f8ffbac19492c741722b127957"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "5cf52c2a930ff311f04ef481ccbc246d65a3884f7c0cde125b47dfe2ee8f1ecd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3d44603117de22c60ae824cac3e77523783ccfc5b37f18deaad9802a54fc5fab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ef96610224e4e0ba0caf0a043e3eaa09a1962edbdf587c35f5186b26837a60bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "033991eee2d3390011e6dc6efc4bcc558c8cc6bb87aabfb228c176974701d833"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "b89c87380878453cc1d53828f85a1e08057e350311e465e37145134755d57824"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f3f5e43eeeb4c3485ef263fff852d6ec42f9872d6a7b75c57cb006722317b1df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1b4470eb39ce1f11eaf9775ddf3cc3f2ae2c6105e5145162f6bc94b0654321b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "7c95cb447519b161440688bb7e4c3c6b1e090064d539b55cbe20cad03761f72a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "44121d975999a7b988c31b983fac28ca55b266564e0a140414078a2266d44acb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "379088e6f97260fec3a96f337695774b422af1373e808227a7d4728cd3aa14e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "29eb9521cabcf7c7a8d1d8626845f111c435f937ae2c7b852713578d28bd79db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "13cf0628c1c5e48be7c8bd908dab2f9f3dca95e55e97a2105c7c01e254c100dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, 
"b5abe51f10c5e7e70b6985ddc8a06942e7843f7a8b246f5728895d8fff6b19e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e15f261bd31f8acc7c27896ca91a16de35b74beab51ed5a47c4bb2ba9c07c16e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6fe4e81d059826a30232e38373484ef6ef211e29676e3f7307afb58c7de34a57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "16983a6e1731869c108a42ba02f1cf348dfd97cda11b7b79ee140086b9de8652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, 
false, false, "386fce7a39f80bfacef951754d16362ba38c285ad6880c79e236f8e49bba0ca7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "05b8b1f0e36b6e6956d8f3a5dc608a1e052d8b8ce4979304d0e3f1e16f0aa4c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9e799ce5ad6976d6c25a03d1acaac22042e87686b3b491c48ee933330cc6ff39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "f7a51f270c3825f5230a11fba2a7ab3d972a9ed057f06d5e2895a013997388d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7131c86e9e8156fb1fc2a8e9b659af929ff80325ae52ad154dd1ad62b4f46829"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "64b7dc8f208ff2eb356581576f169346e2c7ebb33d997baf1609124717fc0497"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "38d8f50c5ed10cda98bf4be417356a31afd284a9be79393a4a60d1e92dc1f27c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "405937c8d404c40f2adf80e6284d43bbe052246a033ce37e6fb93793f651a671"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e83870ada70bdc167875f5a23947220e7e32cdcab5171a56dfcb665b81620a65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "873b0bffabe09aa26518c9cec854ac3a995d42073afc27e8c18301538db87ffc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a3bb461cf85470314ae7c9191b710a9c8e29fbae7d3020d065224a4a9b16037f"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bca104bea2666f728561fcc7f5e338e6ab2ea84266db27fbc5b3f07089dc45f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "40c53fb68f44ad63274a6658333c51b769e58984e50746aef806d1fc8f70a798"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a0384d0b30e2d4d2ee681e2d17e8ba3bc533161e6eccfb65c8a4039d43d1b25e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9897f13fe503c4f2cc756a2567b01eac6a250849d4e323730bd4bf25d2b12fa0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "dbda8abc16add89a529899c1288b53b6228d764a771ca962535fa10bc577dec6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f332b80f67037f424488c757df0d5f09b40ea6a783e3c816411dadb11ca1f9db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "d3d3697adfd6f4df91d23217a7bea61c6d899d03cb2cba588a54784eeb8c58bf"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e03bf754384198b375bcc3c9b08a26c03a4b4f3e3fd5dbcc29ddfb7d5239b54e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "49ee687ff0cec6e70eebc2db059fec37f0bbcbc019d7382cf5b066ec97bf56ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "51fc6959603b2b5583f97a283c05faab029a9f2669401bed2e8b7de265c8de87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 
0, true, false, false, false, false, "a5658a35309e387fc60eea5b2539a09c109d7e747645e4640c1451bb2ab548ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "0016c9cd72595f181cd9a5cc5c47bfa97710000a7570df5c5288634ed5005c88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a318ee99097f634c17f71307384138f438076569f5cb5eeff61002b407d97d7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bfb5c77592c9d09da4ef06f24bcdefa1ae1c934bab9d7c05006c47ae73054227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "25202b0466f5bf7a76ca643ecd11677a08bde68aa116ff57fd261fe4b0e916a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6514510bf02c68c1c115fc92467774c7f743928dc871330ac97697c3a9722c0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b7dd0f9c8a0f63e343bc189e91eea025a25d1df687b5376fe178730959854855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "86fff18f6c2ff726d1815301510676998d7e739ed60135b6d4ebc47a1ed72256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b328aea101333e14a6405fa2efaf1a70c53fcb43757c681dc376e1b229602677"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "eeeebac961dcdd7312ed0d580c679159a378f03640a484ec6f4eabbc51346ec5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f627b56e44a3364e214b095f1895c9b932a353c71e5f943dcbcc4a048346b149"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "026bc78d9326f819e71292a20f6e53d13e6a1c14c2116c6a28d45e15d538bb09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e43ec1fed2d8816d9198d8c32d41706ecce7ef5224bcee97521a27f295404954"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4d3c982e732bb3bb1421345bcdd78259fb0b4b8536925c5cdf2f41ce24223eae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6f6244515c4df0e49428126ccf2d3a8d56625b8cd01f9190fc0ada5cc26b8b6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d3ab789a1af9f5dbb7b34a1d4a579304d0e1282619dd39f113e1c9fd9d4ed82d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3e4937539d1d3d8915a5816a1fce691ffd50f135c48efd2946034abdcdd5f2a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fd839c592d40f79aa529b428ff6b1b0d9353079099a694473cfa1778d567d5fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"1619455fb0e25b6fef296d052b6aa108e0cf241e1695ba1885f5545e1dfffe30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a392a782b9464256538ffa276228bed6ee3c2b130109e529a98b83c90c4557b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "4c208af8107a8b3c41942da280edbdb291fa7e7a9654f96bee843647638f520d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "5ca83ace4d875ca7c259258686880d30670c4e8a4833d380d819af44247b0ae1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "120a561ee327ee1727e4048d993ba5e3d9db1393008a9e251665ac33d9d9c90f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7f63cbd56d51e752965a2c2095c21a537bccff5944b8743f7335b671eecab62f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a31474378a2219198112883296532c3e10287b75efa32c9a5de515631623210a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "e1fab11469a047e893ea498262eb4bb562bed52d7152892b7d140d3f87d74fbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "73981e441fdecbba64308f33b742aadb20ac25d8852ff07621015ff6bedc6d58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "dfafefbedba16e0b1e74461d02e7653229116f82a306cd7a97aa394d5ab68317"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "47070329a3c0de450ac9dcb5c9bffb646e6ef24cb5cb6a480fbad0574ddd12af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "aff2e1ad6bf33423a8102f52863a956476f5fb4f54341638a5b55849abebaba0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "79e50a4e3c705c4a27d905a1c971fb6ae8ad4d638fafa2e0562e845077f05320"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e148157ad6d78074aad1e1da8fa387bb3caf70b25bba7777885a63a3824315f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "72699c65ccbb589909d372672ec1d5e10e40ea37d0478ac198fdd17e3e01533a"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7c98d2698e00ccfd971a75576d327c36f3b62c2188bdef8d6d219208312d18c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "57cac8b138a8feeb413ab5cd0017eceb272cac59978b91b24d22261330686832"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "75c5beeb06e206256710be29b5f6117ba7fa18268a5cc63208ebc09862d17b0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"4bac9bd2f8091eef66a3d91ae3aa076fbda1e608988ef74576efd80fdc251127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "611fb760d5abfd07c56456c3bdcf772588a6f88bd386908f6ecd78716f0eaa13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88fb2a939b34431b20e27eb5b2ae465a6c74411f96d46ababfb8c4873e9a4659"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "390899e4244da40f05e35b9a1102cda53a8c761402d9b20df3197c5d7ab8b173"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "95735104c66050aefd80addacab75bb9b5be482ccb4222703438e795c68f9b58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c8b9e5dd64fe2931e71029b9c3d67189d0cffcf812a042afff9becc11fae66df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "088374f3f3c56a117b30ff5a78a53adcd13c2285907a9776eb25e1a4c585d873"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d84d7befada84b5330ab278aa33a8e2195e884295e1834a053471dba9ae001c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1bc174fa5e2ff66fb32b790fb02694b7d6fd86bdee30593fa2f0cab341e67eeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d576a47ba99fff767ea3bf801a11240f6b60d0ba3da6daaa0b5d76bcb7c2be88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "d3e562517d1cc496b109bd7a14d288a61b2874bd7fcdb1dbabd056fc0ba7db1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 
0, 1, true, false, false, false, false, "401152eee59e0a7bc3b7010cc5dd91969e3495c33e5036d89f0a77f0a93d3145"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dcca2f6150e0a2b12307f93a5f60a3a1cb974e38ef3d924b82931ae951c23267"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e3cb12e4953306969f835bc26eea41b2ef8b6c4f56aa9d6785b1c45ac7320553"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "93804ef17f44e27d7217fbda0eb499479899703710069c4f1fdd3698a15a31e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "8f53f9446214e25482a96b0ad959f5ffedb74d70644f0cb63a6d5cb275ecb1b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9514cd6dd5788f531f89c2cf85e5c06ab1449960458d39b21c2ea22375740a19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "aa344766534c0b004c4a6ccdde8e91fdfb6845f2916c829f0c04450809ad6394"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, 
false, false, false, false, "0d003513ee85f2eed048e523de982e8ca80006a7f807fd7106e0d6450dd070a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "99e1dc2aa93bf7c2198e65c2234b20469bc5d3b8f741d4388c730cc60ed92fde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "aee383fa05deb7afc3314a6c331cb82f62c389b9da0c5c215b7f8844939d7c83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "019527ba7a57610348e3a65c46886e8b0e34ba2a35f116ba1f34bdd6d20cf41c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2a42a3af942f2342df88f1fccca8cd0fe19f8c9ed2e18b31b26e0e44042f57d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "7e77730390da1c3edd0378eff510725717bf7a64f53b7336cae9b1a77f7fac1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "bbde54f31f68c9a7d8d40bcec54dc52f1f3d71fcf87fe62df24bc4cdfb63cc85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "eeaae58567a08e304869294d8f239642d441d42a967f5beb2addeeee0caeb6cd"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "8d9831d3d3a2f572705a377df90093207dbfe79a04d4e6492421ff0975f68f0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "63ed8bb2b0d8111c245b4ec9ed65fe23bce16ea2f7631f61922bd980117199ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "391076259ec5f6c8a15390867027283035a19f967ea14f6aa972780655ee091c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 
3, 3, 128, 0, 1, true, false, false, false, false, "72c931b675f483c07586f1eb48d04aa20b6abd5caa3f9f65affe6773baa31e8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "46669dc5b559730f4d7b7186a3600ab7fb2ab9cf17a073916031fe7a6e9a98e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "29279fb385d35da63b7c1b34e25628bf3672a6f00f2a2b40a8f8bd4d39efc185"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4e05d29bd6240f17fc052a6533eac4000be017b36b41c855c3aa0ff20696a602"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "501ac52377485b74711d0451b52ca3dc805a689a04d4e8c6a1b2b4c3adbf23ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6a4f17b1732385ef0544bd91f03936bd5f7b1e96b46cfb8329de2c3c6ade8400"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "3f1ff2ecc49b1656cf6fcfac6b9ed9ea2bc090204eec880e9b90159ef3b6631e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ca4a000a7047ec5d39d6fcb232d9125e5acbcbb3e8340ce410c85c41d7a300f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ba474d745b71a64520359aa04640f4251bf747965d39a6bec51b5bfebb065611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fa39ca46ec2c4d1a03eaf818738ae92aa9d619dddbe87e146bb04a2fe7c13050"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "0fcf2784fcf7e61de7061188939a2174402034931bc15440f0f542d5239f61f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "a38f7dac8ae95388fb0157508c9eff122e1567f79ead4ef25e3bf9eb9aeeab29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "81fa52bf7de3e06d8f99cfe638b72d869374003275938ec18d917b9149d4024b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "1c38840213545840ae89a0227cf3c43ae5c47a2b48e640300bd5083dbc603d32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e462821cbb63f4d3b7c0c79bbfead487e90c37ef71e6ecfd8004e6a80faac8a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"2d55b76c01357a19c9017c22c203a5b3dd219b1d7b88286145f639021c20d763"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5cb68ce196e4e4e7c8c5d236e74a2c3f9e9b449101935544dc3936ac0061d9cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b4b10376b557bc7facabeda4d07a572104a74aeb2384a7cfa2d94ceac5ba0625"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "47b5452198e78b0da9684ce371cefa8ee1e1963c515ccc2bee2fd6b39ff11f66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "37975669f75941e9f0bda1e11c779d059971322a0f5034f90dd5366fe0eeedfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "cf0587d5432416ef0ef27032acc7b082743c9376c01a2056715b345fe3f29a6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3ead0274ac5df467dcfc349bd9ee0a40892b4661f2a69af8db89f829d2f13f2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, 
"31c8e472f1403abe372d80eb9da203053ac0ca58d91b8cd2e9e3fb9c759d8b33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "05778f477c3594552bc8bd89867e87228535b68fb98386c501dabc60447b0ed5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "d7fb87f22991482e4ebb4745d00e694a92ad88f07310543129360a1ceb92c9e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c658575b8ed72b7af12136def6c6aff15f5397e8bb0d592e165d8c5951b5a3fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "45f64488a98c9496191ee3821a03fe51ca06fb8578dd8ade92aef3ac94f921f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "2afd1ba33366aa685358ef612f79bb0f60acf52753af9ddd9199e11d6e179966"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2703d80e2175ddaffbc253c1d9c93f32b5dda54fa138612317c43aad2f3169ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 
16, 0, 1, true, false, false, false, false, "0d96a525fedd3815c5bbf6868176586a284d32f5fe829d80a83739bca1cf47a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "72d325cc55f1fd2e9d439a24b64f23df2bfd06be5400d4c3d37148f94c9e269c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "7b562852edcb44ac886b5690e1a908b3e401d77ce91f9f78527221073da1e372"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c4e8e6e6326f623a4c9450348d5cb63a11c72315a1d4be687d4e46694c35884a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "62a34913bb64cbf1226d6642a7631088c4aa80ade2d0f95b2a6bb8d07b7cafb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b646b69568704986a5ecc1701e699cabaf7d68526f5d5916c1af5d3ca6f1e877"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "eb942c0c539c7f3a2cc600f0168f61cf61ed0dc32ee81f7d95d3cf30faf5db4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "7031bcfdd6cc2c700d0917b317885db7b836b7d38a7731e4e9d7cdc20136f9d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "43328950101b893a47ac83a338de12b7fd2695aabd2a31bd8db59bc6ff7152b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7c4356066c358fcb1e568f4c3348f38725cde670ac2b11e4d8e395b7e4e5c989"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "34da3691db160f97542cc677aeb6c9198ce065f9d6e5c7475eeea8b33e22bc82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1d4d4e433de562e91679317506176040aa0382c254fc8ad8768726c8180a8ab5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "19091baf1b9e3b3f8eb1f236ac8a7de6a4c4d4b759979659358b053c541eb846"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "31be801ad7f8796150b8c35b66e471637fc64b93c5108cddd97faf97ea51c09c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "74c328d18a8d54a8ac37728be1eb2b4aa74b64c1960714988b60aa3d3498e74d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bf29e5435c0588b6569bc66ee714ef71b0007f86929e1ef7ccc125f43405d0ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a2fe3860729ba1112ce2e5d2a9f2c9e5a38f90e79f305d4b1c41f34d9617a0ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "0a34d75224a9c8648405958e66b89855daea6afd012df9779aeae96833a66f5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "3eb04df19db65788c92fa69060fcf8b381242d9482817bcac8e0cc39c6b6990d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "98c972c039e7662d45ae3cb8708a07a17815b9b57a79d2cf871e80b43d6fb3ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8712d73b8f02158b173db904ba126c113d481cdc26b590a26201af5cdc562dc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "b9a833a3614bef1b7c6584a309a2170cb8d82ad188d43e2e2ad461886844874a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"6e138f619ea788cf9fcb3a7799ecee2b06bbde9770abdee84d16488619345e98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9bf37d008cc49e5e893460ccb0332269cb06d335aa08c9462abe9f326cea980b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0cd7691584b519e1cf7194dcda4c908dc7cece5b8f3962f1c4bcd3022929d6b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c35dd73dd5a23705916589fc437c9a6e73f9c5b11e86ae72af2b13f9c48d95f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "9685d2a63651ed33bff9414ed2c84a9f0c28975822aab050e821c21561c154da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "70ffef5163911ea92d55129bd9d4bfbb7404e3611125692c5d9c554ca187f4bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1598f78c7d5bef7fe0aec478125ca808f442a25f7ffffb03d336ce9b63b9ca85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "4900cdc061038e491965e5fd10d14d3c37224d9bf5fc2c3dde8847358ef5189d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "f308716d8d38072aabd7c9369a412e0d4300d2df6f236d30c56f0b3c44affef7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fe4e66447fea9e096e9acc1c49ae098b5ac23215f254b22c495d90aa454faef8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "be0801fad5486f11da78070e184476dad0655ce09582dd1a5f8a5e3ceced9e98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "4a6ab9df160fe069e367d505386a54c60bb0295b1ca85f9618ca39b207c514bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "b4704acd88c098b43e37271ff4762511545cced3bb25c722ef7fba089b430bf0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "351e61ad77bf8477a2af30af0264f54811d90cac914e5e30543bfde0790242cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2383f496491dd6588df816359d8fa82d6bcef50056bc618904cc289fa4cec19e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5e0a8a972a5699968610605db255bb8a629324fe11e2f08bdcea9748813b8912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "de6f7272a6dfdc3914c35f9566017363f231f63a9681c152620c223b73fe04a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "d27da81c6659728d4ee5961410a773cda175bf4c4e445ee54e4a01e2bebd0f0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "7e57bba056a7107fba89d319bc449735f9bb80a6641e708518b247f942be8a8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "6661ed476672dcdd1e063fcf182b63db2b2c6271c911c05a9cc6f7df352596c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "47e9350c0adf1ffdea73ccd63a848ce7cadb07e8bbc28b12fff8c49ad3fd627b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, 
"241fb524bd138ee8c53ae4e183484e03ba93738c6ad614f590593944e38fa6a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "87edeecb4cce53db5d62605d44e4d735f485c8c9e847f766a5a80ab8427ef707"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c1ad7e530fd9c52188000d4753c9f1e2aa3dda1a15569e6cff64f6fbece37e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "aada74b7e7ad65b68518218c2e45d6833e1a823e535b60dd98c9967655fdb22e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6f5f5530d76b9dbd1e5944377b95e14ffa759c72fbb0daedd6d71d2f40851aa0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "04007d6549f975506acd98ddd6e25804d4c3c9d219e609f857f0bb72a2ef3043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "eb52b0f4ca87092113d53025968ea639acd75df306cb264c489be2138c6e2e47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2278b2ac55b4aa61d50585f5e6c8db837a709d0af819f3e7f01512a97a3a49b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8cd3f1f4103c344c477e4e9aea3e23cf85983c17a248fcc1a230306ca1099d69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "33df46da72056ff1e15c3d44060d229ac0a9c57ce6b99a87900b1b3d2c8f4cfa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8963d12f8aa41ff24b5431c7f4aff0b63c59b9528af1488eb57438016f0940f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8965c825ff08f93b172547a6373ff351197001800797a9043270e9b078523c87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e0c5112d7e3e584bcc06d8b4249d29f7bd1955b473178351e61331c1f8c3c4da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "c1d0f93390cc7e872c80f9719282cc789c33e5aa1529b948ef164177b4c61c5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5ee12ee6f06fae8d40d29b177980d6453c39b0048ae822183c7114e67967f982"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a5c9cc267cf730f9d5f718200b2a1caab63d32b13f0a5f21d3f92fb793fc44c8"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a3fdb7b31815b4323ec1fe891139633d2a446fa4538a6a54f5e2e842c787a2e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6b9c574ddbf2e1a92e11eb88c51fbfce39264ebce2eafb0f58aec91807bf201d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ceaaad1d721377a2565dc786b4f8bae593f721583fef16b9a504734cf6d0b236"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"ae2d687666f7fd94de0ebfa2e75a3a60153a471003b79fea86da7e8d883ead8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "66a949a01b8b1aece43a20069ebfe8cf361760b84cc8c8cca7bcccd6aaa1a250"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "804b6040a1eb11a24cec08961285ceffeb02c39cab39730e1055d09c02841697"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1459495d7e04420687bb341385b0fd769e6694bc22e1565050676151e926815a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8d93f1f86cfa8d013a2e3c63b21016f16d377c8a8e9b95d29d8429a2a8f67e9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6d41262fefc6c45f2f63987a7067817354ed04a7d1d45d1382460f001e994624"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "7fa07da70f4291da1d680aa65a2bd5e33807954ec66c9e3f3c3d4151f694c760"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e36c5e73f64e3a11cabdce12d2f0b463b4240a41642fe7451dc57732beaf16cf"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fdd8c5fa67faefaeec67c8cb0a9dada163ac07c31d3e0dc8c25070cb525bb747"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "598c7e78939aa155181e26189809a84a18d8cfd96b7279ad572ba70e69592e12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "34cee691561fdb5a31de656ea70c0d3b1df5d2d6d5815cc7e3f5c0720ae59c17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "34d7fcaf893f1f3ebdbe3bbda08f905312a303d64543be25eeb69493b4e55db0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "d765d7ad3475aa8464cc0ee60de1cbef588a137cffbcfe9a7a62479195525d36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "0b3bd3647dfe6ff0cddbaae7239657ae88900f6ed348d62ee12b0a30d28e82d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, 
"a73ba38ceadd30864c7babeec40df721b8eeb29f6771953de86eff3d74c4a198"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2cb054074e5628c8fe693d8b08b18b3855d8952b1d083b93e5f94385eabc660d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "69f7ac553ae2b89e26ad60d4488890cab10cbf5beac51778cddc680e2c4008db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "7de4ed9d5bf2a3e18039600f16caffd59330c0bedf25e5c86300bb5d963e1105"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "5ef8a09ed9a9bdc64b365f2c75940a36dae3c2491a0d7ffa40949de189df10e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b621ed7409266513b91b73518eaaa413ddc7407754077c823af678879d716e28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "4a971dfda7ec1cba735106e951288f051d8e02f7a526149e315d16c928970d7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "59800e25de2dfa0573dd55d3d0e804d1e6a62ea26ba104cba9de37b902d19d48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "cd11c93ab4ec648fd55d00f913ea188cf6194f34a15fd42043ae7dc2028e6868"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "9b06aa415081381c60a01e26d13d5bdb44b2b29d9039329904ab56d00137c1f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f7e87e55cf5905c06dcd2e085ee7e665fdd6acb9da51d20eb757dec625ef6ca7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"153f455f2edf0889e67e019c340e68d38464f5f54c5b9b18353d51e7ca7b029f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "b2c14fe6ba7c9355f388338109e6d611b4198f0f6768f670826b256a17284223"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "64be4d718a221a6344667bd009873779fad3952a10e93e5451b01fa0682e135a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0885fc7a617e31dd13d15e5cf16538b21ffa4809369e9a666fe968641f27d45a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "910bdf25f9705b916879e55dca4778480250c24d40d1fa565ee98b5627ab86f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c09cbec390057552b47a3ee5ec2177ae12af513be662f08a0758447d5f5cade4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3fdb7db579a631a0bf99753fc9d3deff845b75adc17c06459004bcc728e3f9b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "bc80c596d2ce249f624423cf4ca356cf840af12854691c8f045dd9f6a26ad863"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a8221f76aea09233e8d3ca90c88898a66db3c2a1686cbfb6df469bd75ef52c15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9950d5d370733eb88a505cd4bf87c10e5e563fa5d3b5d24769b46b3502c05312"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "148746773dfe7cc7e7ff6494832314921da82f37919f5c5602f812b39fee856c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c13a48d68eeb1fccacf9bf30483a3931cceb085f3cb935b539aebaa8d42050de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "52115edbcd44debc37ece61e085449ed03792f01efa82e915b56723fcce0abb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "60bc10b6910282f2a72949426fc2a6e07872afc458066175470487d894e38bc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a753174bb6b072db9a0ca4970dd93ada6bffb9c9539bbc09b80d5721262f8258"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"84ce1d68690b434d80985169184a412b43b34e603e771312c0af42f0d75f7f92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b1acd0a84918f5e9c8fa1182510a6b5c19f52ba8b37efc5d7f78840a2230c271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3c79197316655a52364e58d068b84ae0b91376ba6febc3e1af2f50a13ecdb876"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7e1a964a59c68db1e3d90629e02153ea670b51d880abef2ee40140b96d5bd651"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b046c16f1436e1d124c13c331083a268c345ed016f771a71fb718a8a6d6418f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "8db7137b553905a71c06c3ba08ed1e69e9e2a46367e29d25a57ec54c438148b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1683518c5b41f712924bdc4c4d071b0a9ea6454e6529a1352d9d58540afc88b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "e1627366009aea6ea1a8d20cef14448f9191d19881cbe7bfb06a29b288fe79e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5256256067dc47184caa5ce087a2755c4a052112c0c9e90c3fab7c32efe6281e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "e84ef3fd43d21ade20ac9b7eabb6ee162c85e4250d55bb3d104b0ae7bf7c7d41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "4dff0c29a397beab75d09088fda59004e140127a242c73c9618f1375d6105467"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "ee9c01957da165d852c96959803bd083bf813a4f426fdb8bf268beeb1c0a75cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1aa9e348367025bd00b0320cafcea40d1ebd46bec657df895978d8ea803c3276"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "e47c884631dc20e22941dee76d312d0300e07b8bf19672dd57fc28a264125f7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4f39a69bbb2ef46ce85fc56fda0b35acc5d4160262f3060d0fb05b17895980b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "406b427cb3135aee7cdf5bd195757a8b2ea113f05aee16d20b3f3ebe321b8608"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "baede93917fb6cbde6baba3162a790a0599e3bac17ba8f267f30d52281112bff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "423696e2f0943cbee92e92d1488dfec0ed4e02ba0623a110b1a7673b14b4d6dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a626b3ad0835bbbaf6c942dff944266c9ecd7d6c5fd62e51163db6a0b3545d23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c91ad169cff56dbdbef650bc4e5ef9a800c20931c55309958eed4fc616432be4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "95378bef8975e6e53ed09a52793ff15c30c6dba8503556c45a7efaf9dc9b6e4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "80de0bfc61474aae0329d73533b5697b6fb9a66a5c057a356d52657582bfdb4c"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "eb90ad0805b23bb5c2b9da66b2c482df66d6232e0d39c385177ea67bc9a960ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c811e2b30629973d0dd178ee07b0e43e8cd98916dab88e4d2a697bc69624e383"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7528938d4753c76626ac90e083543b6747a97303f303816e06d6c5f24f528e64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, 
"959c04385e6081655f58810f3bbb955733589d34502c4482b502f48c4b45429d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "f004b2ab63f826973b38552d480ea0f14d932bc139a00561ae0f1073573009d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3981447dbdea0b66c39106e3fbf6d513f6c8c1ec135092b97045b3796e9752bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0331233809de4afa938dc74d4565ded6614d012e11d9fc9ce74fa57be75c323c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2a79f984296da5697ccf3fe2e363b0ec3a25fe886aac746a499a874951c59cfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5f8126a47fd6b71f0e35b81b9079d6be5c74972eb84522ade8b1fcd69e140d8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a797815f8d837129ac927a8d4a1475761ea700b5c538de80c452825933f164ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "14c8fbe2b6c65dc262b06a496fa7d58c24260d02ff6e5b119538e55b696992b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "84b6d33d6140fd596eb6aa72a91e9b4ee8900ef9310b284215decc02b49d2324"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4a53979cd6af12ea09f9664bcee5e773901d62484c747c12c7c76ebe92ca6dab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7812f3ef6b3a820399c6995e587bbe40712f68ef134a392a28408ece0f620046"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8c99ed85732d06b0afb726eee3a0f19c5d81e56bc9a0ea88e096cf75545bb1e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5566e6bef0e822a29c12881e5da78e9dfd6ac6bbd705c355137eabce07c22ed2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a0662db5164150fc9cfd23cba178ef13ad791ff118209f2630911c989c5b82b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "49bfc96f0244188f5ce4a26549403e366b1adddcbd15f0fd4de44b5f70d49681"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c4ddf296a19ee91b8d655e36f169c6d40c613c1fc1084aeda6ca576ad4925c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "980f13a5122a2a548372f7b11f2808dd27c4d5279ebe30456402e546c5951b46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "feaa93d764c0b6cd718692cda5d53b9aca26a23a862faf422244072c0ac6293c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "67c9bd9dccfa2137a833967569926a9a48ae71758fc173ef7aedc3340a169c5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "4f6b1b4e9bcb5cf4dbb0db805e4e9cca564fdd798e2e5db86aab019e046682e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "4c6e35d28b55c6e5b570c4fb6531819242c3361ffe577200f25213183e22f56b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4538f8ab04d67a49601126e79b6d75483c646d19e941c31d2c1c4d71f52e3e14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "883450414650b68fca435aeb09a0ee1a13f7479d5ea32da7fa2be5288e511f43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "6b839f55720d915f269cbb9f28a057ffb5d225ba22849d3609b473fac48d52cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "e1205948d1471f4adc551ed421425452af8af853012fd2ed800c98662b89b0e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, 
"7910a8ed0562838416dd91cc9d7e98dae72dd0cb86d194bbf9526f3e06210a67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "e99641906d5093837159217cccbd8c3cb313130aaa90633948414257e0329398"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "a3bd374cea0f7bdf14a895febb062e304e2c17259a6d6030b3ca8044241a2961"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "d96234862187f5c0b69657eab3ce147a4dc1d0ff344252a84e381591d9d16cca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "658e11e6b70c0b2ee8ce8a12a9b3a81b6ef64a088dcb355de934a859fc8f8758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "05ecd4120e885bda1864b700e3d26319762b39b94959ac83eda0210f4db56dfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6a32168404555570925d1f982dd4b06cc2b39cfb46ef416d4900ce804ada7b41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "206637817bb6d3e6c8673a4ecfadb6abd53cd6495c4da50a8e15f4ae81b5290f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e310651d01265f4f7da473e42ce441862eade57b9d1ed6e44fa188d548b4bda9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bfed7110e50ed8e8eec8bed9396d09dff1ecbb1da94144798201f42147885cea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "457077e764b286aecf0d5074bf7f5eb78ad4463d51cb7e265300b7557d0e564f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1a4ff496b73d4155dea46e889a79e27fe74c8c8e8b4ff75e1689a88ba116a318"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "5cafea2620bac3b5dbcd37610e0cfbc1755d0638b7fc7ece433b47d5a1f14cea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f28d3c19f98a25ff13e5fa5781a548396bcba2d7967539588fe3c980b1ca88c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7a68368be8f6ce3dae444e46ce514dc5dbfd69c5bae67dae1f697917f736a027"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "67cc1be60ad4bc79da7485afa6de4982a9072a3eae0a8e42da3501655e337912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0447a90f37bedd0416bf43904d8c7ee8e6881df151af99dfc8c57e4eb26b3f62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6603e00ff7314659933f7c7641c7df4746ea9bbbe5c98d789186304fcd433d86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c97dba07a33444ec03c02708b466d9d171ba975f41d5ea78ca295d8ba56659b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, 
"715d4806669ebf36fe8133cfd20017426ccfb40073bc4462167dbfd16f4a2fde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e17d3d662a02fa5b32f66870a4a3e661cacf4cf86a3a1ad27d23f506a3a0724b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "50f2f6208cf7cb9bffbd698ac1d60e355e0a9988cdc16898bf74eae2a3d273f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1a3fa69b69c1a25b0c685364f4e11b1b465b2ece4b4b874aa442efe40d0cdb69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e7269044848b33ff29f19faedbef5737e13be539529c38e716d05ac6e8159362"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0176338033423c0afeedfbad2113e2d027e376c350f0738b09661dbea7e7bdce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1323ebeaf5978c782491966bf781732c8c29d7a9673a4e36fdb3792e0a1208b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "688d102236311f1da1cbe7e0c9645cb1ad40f6b5d62266e9a562a5db2416bf01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "839586a52b52f3ad1cc30ddec1d865720a436b1ede78d6213ac4d727f4f7b5fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "495cda2d918fd1178fcc22bb3242291393c163224adcd49479edc22fdb1d3199"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "209188465cc8938bd0aafbada11ac767ed2780a1453586e9745c07e122e69135"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "7ca93dfc02e41eca74d889581e885e956beea23d840243ff4d77c65b65d13976"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a7d63cfe421a43a242a38cf82a1996662893b2e528f9e1adb3d6580d7aea83d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c42104348f45e6b4277d6f5f1ce165dd3b561384516cfcaabdb006f65382473a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7a138ce29123f321475c5f6752ba974cd1a8e68532363f5006651cd2c9551f4e"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "c9a227ad69aacdaa007f10a53afafe0f386e3d2ab2123658e40376d623629812"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "1130cc15966a8c49f2c80f32280f0882c2e6d52944b8192c8fd419e40edbbedd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ccaaa9ab2b5cbf62c8eb40ed057a68a21c4ddb77a0e7908817f852703835f8f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "7e4e8f31b5c628f890bc7c2373e8ebc1244ea8a4071311ce8825709d7443e65a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "4a5781dc9b5eca15044f90113fa38e88a475df0f2324c4ce75f218d48e4ef7fe"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "704410eeb2eddffcf8310c911615fbf0a3781a92f33901d83d885230db48779a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "cc4b4a97807871049645c92c4086660f5953ddcb621a38e0e01973ed4c3e959f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "5dbf8b4f71a0d886f89c4fdd559fd5595f2b481facf446f95a311d10c668e86f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "1253f5bbd9446b02cf6ac9466182b8a3f0e49a703b1a029b190a494aff877018"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "762cc8b42bdd09034a15e812797d011e04c7b2d274611b58578c8184643f7690"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"61d5fcf85fea7ce7dffe09d1e57076a3db43b259a2710e389b35872e74fb7f87"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3f25667b610181750946b1ddad855e81e3ec49b69071587f1b3e669969146ca0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "efea722b3f48919c4da1159892db9f254f7bb506e2ab38aa8de42d28a4e70361"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "10100112870619e7285e58c52cbb0780d5eeb314204eda917627d03114655a26"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6a6cfe5bf69537e3a192acdfa9fed600fa266bc558c378d483576a3d899e1f33"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dbdbfda8300d807f2ef5a3c98824888f261152d41b62498341a87942fda86bd7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c7a9e741983439cecc818c40c3078d5280067816fa3def50a03c4edfd2e8ed1e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a59af301da3a3d84d80b7054493329afe7ff2cb279b805168de6a234af0c3d38"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9ecf0f185439f6e4c8b9713eb018731a000e6d7735b3c1061911f59b303593e7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "65debf463b7a872eda4c91b5fdf8f462b93859dd9550c2e53f1197ee51b4bb87"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "abb394b2a3e481f182a3d17e4d0463aabafb55e54c48798641256290730815d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "51777986b681b3f1661b9672013ff79f3179cbfd90f599996873cbedccaa956f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "66e39dea34789636a4cdb1aa35095910487b03ca0ce127dc559d4d82e55f3283"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "df5d7ed6e5f90c026fa3ca07040234c74b954c35c47a64edd1d21da666f99024"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "eebd22a68a735bfb8f0229a0aca18dfa8d8c10d587b67f16728957e376ccebd1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "059a3c18c97395c01a4d282ced0dde4621212d86028dbd31537f96b164638661"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "24b3e50ca49ee01c26a58e0e782e7c6630e2bc735632c4f7bf2fbc37e0d3f53a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3c7b667d2609779c5716f7081547fd76f4ff1666bae2a67e69587cdc06b89b65"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "d4e9c828d5c1da0078ec23ae05d466765183b283538e9f540995e9307b2a5d19"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e02a5c88ea7a806b09b4590c628e5f42494d8ca8163738915a638cdcbfa6f76c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c4bab577692edbf131cb3833316b1ce1270423cf7f7835869f6417c111558aa9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6ef548c664f70affb2063152927189dc7b1af92c3694db9414db2c49ec4be1a7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "843b06ebf476440c52e5e9d754c710d481cde9a51c7524f8f379b8723c783a8d"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6babd03c0b3d52cdcb6e12dbfec2e626825cddeaf09fd9ace614ef35cdd8624"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5152ff85575163813b7e41b82df0e57bc049261b0996665fcc6f1d4b6ae8be5b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f7ba7458f0fa6d27002422bd975578ea896e289d099286ab5f5104c797dea0f6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a6a948c4618d1ccd657aa7566bfefdfe7d310b19bc23c9bfd84e94b3ec16171c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "efcb144ebf396ba2a65d491e52b2fb7307decf1e4202f79d94570897e9fee3c6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "833c6452c480cf677589c01ae061ff04f0b1286e527966876a0246c102aa4e96"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"b8e20aa98b0383ff2c2e10d7a5c2a28e05c3f1b5984346cd79d7c7ad6f4ba536"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "2cfbb4c06b5b96301667de88b358bc08a0195130bd737ad1735d9088b4162347"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "72ab816366241c695d6454ca621654c42cd25c5e3003ce7921eb2bbe15012d23"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7063cf10ee0d6a09cce9866f7d6d425e29367c69ec693a5c7f5e54cc6c66568b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "8dbf56b42520af3cbeca732f098cb422ff568f90d2a49f1aec94a3b1379e91eb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "317484e83341e5482531c6f9d7cf3cc53ee89d16239d210c78dddfaf98b62820"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ac81c73a1960d78bd08134f7aee930bc4b20335635ea9cf426265b6ef7b9e160"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3016609032971396adc9d802d7e19a1891d4800c8fc6d634798b39b7ada41141"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "b907de74e12686ca03c6f6d6846b9eeead660001644709797ab07d04beb2ee29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ee3f5059f5239a56883646cd2116b7cfe6e5f20f8ab68e438a31ad9a0d42682e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3983c9c64d338a4bcca8c652f5722477b79926bc3a72a6a0e216772a20f29f28"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"a5616ea89bd462a3ef5cf69ebe9ada66ad24f0155f918daf527ee1d6564ac174"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1b6e0a11551d849d3eb85f62e0597c43517bdd2a7d8618715fc1ccf39239c04a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "829f46c2f7714998f416e55d05c8421fbe8378181130256e31aec54c395764a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "22021c721b964fc297b8a50a8b6970c604ae67a00c6e113d6b0fb522d0870eb2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "89e49f79327b94f21e2ef093781de7268636df9133759f50492fa9f2503ed796"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "64ab4e35e0167fc401b7eac286f1294e19bfa56e6079097e4dd3306bf3ac7d7a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6216bf2a858e9c3aee5b0b1733812fe72b6685ee804e6b79b478bfadd0a69936"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "711eb9e56bb3191cc107459862d4687c79a6eca832a01b48c94ac30653a8ad99"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "79e4735c7b57fd0b787a3c15a4a7e41ccf1617a57f7ffe4efd6e673276d63450"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "24916142be4cd50b24745ef3e55110fdee705f75a3020b0cdd94a9d068f524ac"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "688b8ce4e957b346fff90c5101e0056fd47d499d0cc4c0354fbe867798569d46"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "3e203c4e049d3fc283f0740c5d83dd0217fed20bf5f022422bd79c621e7dc574"}, +{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "78c88f365784abd7a38f7e2664001f0d05f4ab19bef1d9ece841d17c026de06b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "aabcd4e33b4b5f1067ddc97e46c374c75d35a948a6b1ee335e1806c6b73ce6d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dab3f695ec8ab5b37de2f1f566f9b1f764d3db521752ebccefd7f7ec3282d598"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "a33c9683238d38f31c85504defdc345fa0cb24d53cd4df744819f2ce181b9742"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a24b321a1f96247d7e0545b2d9649eb32f8cd3d63551cc0e649f686b5a7556a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "fa361a848e018f0452c0f733781a3368ed80f636cc3b8be15001f357f6238ab3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, 
"9286aafca5790d5ad2c88ba97fe6ae984ee50b886b6c56ec016dbcd42c91e8af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "548cb1e7d076662a5e7f010550f8bbc4938ff1b5a15f2a106c9dca20059e43d9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c630f19ce04052051ce0520e62d20026382271e11fd2a102b415c4c5cb7fe984"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2eed8a30da50e45c296e04978e7f9b22436ea47e8433cc54f2d0c3c0be0d94a1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "845d65ab5d63e0fdf8253c31419c01de421863ebf55d495cf833b36f6069fae3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "35d5b443dbef04f5dfe899ae82471522d75415dc64342ebd4d116587f6b869f2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "d8b960623c7c0ac5b17b7f8e9ec7ddd90ffae3bac983f211d6f9e206e48a688c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 
64, 2, 2, 8, 0, 0, true, false, false, false, false, "218d22065b50aaf13943649d0c8bea69f1d36835eead404932842a1399d2f37e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "5c2d3015302b0d57fb199d5f32bed7d0ae18b2acec77e9bad6bfdbd9b7c63e7c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "ad2fab8d629725abe019dd6594aa2e0c0da07227277e458472ed33c183e60ed5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "d7701457033e0b8eca3ae18cc2f855aa032a7913c1490bc7938321e0ffa20d96"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "838637cd9cd0cdd7bdfe62f3c15d40186abe629a33f49b23800de0e7f3e0d274"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2b98bafda510212d1aa62961f57aba9f5fe02b4034cf47d6dbd233a993628fd2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "4d065478aa2184ed32da36271b2c2470ea2db62e0aa6d627838389c2c98976a7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "ef37a3ec27623390674cc38d2d8e641b4ac7eae3192c121f99939b34ecaaa290"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3531e17fc3bd7efa5d4f8903623c1db3b1410090f0e8cb10ddd938e7841d08a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "39c32825d3a8068c3ca41dfdf89a1609c6c3a66bcf87d3f2b64e759a064f2560"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a9dc5728f95c78fe1d02b891f68cc465406aadc43c8e4a72f66877f104264d2b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"8a244bc46e1df8f0aa55aab8beacad553533433a6d659fb25a6752b83e3ff8e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4b87e3d18b9f5e7159a06952c3772e1884543c4e111c8059e01ec094ff1bb848"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f3e87f5ba22aa42a7a49666bcb66630e576cac6e600fc0b67a6fc0f3b532c19c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c245a7cf74ed8f6478ac7828424fb3f1d632a52def027ede09908c68df86264b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, 
false, false, false, "b6736d76875aa8ac2b1e2cb2b21d3f5b8eb0cb9917ac1cdb1f6cd6675d581ec0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b77642d8e0c414fbfcb46b6da94e122fe17d851b7d5c7ec4befb0db6a4d2fdde"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "59b8e9a82edab382860296c2f94618c09630d182c1f94d5d8f871fbbf688a4ae"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "04ce5c7b961c514a02141a11ee77522b44ad182ab55e44ee2342e5cfdf0a9495"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "25f8d1234b7f077772ae452dc598ee70b0ac23f7e1cfc91df927c4986408654c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "eb84996a137529845e5a84cc73bac87229086fe169d57865b5b87e67e279ae9d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1ff2c78ec9ee9a5d1fd0f146e8b5b7550e3069e93f98205f2e7e0365da321a7a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "602aa8363ca97b97da2b9afdd265328829220ea9ab4cda33fdfebb52a01b8dde"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "cc5dfaa3f00564661468f5e3b1bbb54e0d8609c9cf1cc5fdc71f27d842e9c775"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "69b8cc8727c28f15c10a4f83a14e47374bca0aa8494231f0cbef9997b9609be9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c5bb5e0f762aab702ebd8cc46cee389d947b7dc1e1609a81d9e94394c15b3d4f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "92ca6363f6963a530c9fc0ce5e8468991f35d77cdba0fd3b0a2701410d1d860b"}, +{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1f53a056161d611a4e93482b3e27e3392b462c07bde3d7c1557797f0c68f22f3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "76a72c616c0ed2c90d2101325aa4ca0da02b4bcfbe1b886c36b0186505c4c836"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8425faacbda31020d39aec048ee39915fd2c141b3cfc341287a94eaaba10e7ab"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "8eae6279c16b37a1dfcc0ad438be92e7e5150f7a1a3c9d11b1728f6a7d0acd18"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "db2c5349841e5d10a0f1a91c478078e525909a182fda316a5848aae579c242ba"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "3bd9e0d239b9263b9b716b4395cc6f2892bc9821d0a912962b36d7e9a00ff464"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "51a9d682592bd3d99ce85101e6845d51ecba1a24afdd670ac37969f246cd152f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a775626eec196ef2671c7ab2d2c932ab1413cabbb63233ad7e98c8b7acca2ff4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "117d50625e79009d5760d8fadb93d9702911412d1fbff01266b3cdcb13da75f7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a7de92d2d43ab8a935cfab166362e954e73f32f2b0a416c433aad65613bea234"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "815d8dac58a2b23e4579393343aa5b9f7a225c398a2082b853690f3deee4f7a9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "0fa3aa64ea218fdb9f0c95510ad9af44745f006e62857eb3701b2427863bdf8c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "452ba6a0bb31d44cb212aca2811b430a123f004a8e85e97219314e9aaef3f4b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f26d583cd5ae867de5dcaff89aa3a461ad94b17c927dfa961d2179218176cf26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2bc8e80b8d7c56ef9a73a947d49da38ab6905cdfe8f2fe5dd559827ab9f70728"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "573d4e1b264ed9f1e8f7af9010fd819604d2611df31f1de9819ef78f73b4b8c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b2d6c528033e2d522c485b433d0ac7e27dd7f4a5dd3455eb989335c8652fbf0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"2e9705f90914a191682bee57e1d9b23d1c0c8b2f8baec473d034c282df9f0435"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "10ce521fa0567c3d7632b3bcea1f9d31e2b3222b9a1a7e0cf5e872a79421d242"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "79fa691d612f976c9ff58cde2522bfb7f29cad7d4f7db3e1cd79f3d70083047d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d7a1ccbd96d62dec1030a237d18cf832410235c31ad8bba15bebf5d13d27b8fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "23376bd35e353c5b37fb5b189c474d214399d0b4e2ec1fd5abb9af804db72842"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e160a1759a30ad42ca494112a508cd0137505b5a9a319eaec9f3d9909d74e70f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "02960dcfdd2c87962115c42f8346a2aaff1461df5fa6a6078aa9f8e6e666e4a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "902b64ec123f87100962a72c9edeb84a7f0a5510e107bd858c8f17c8f1d21eaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a90e1599fac50de964655553242e49d9147824d4c7b42b075b077863f9889c84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c910c75f77cf302c47b2e76b962fe727549f1ace81e3c59eb859e17e3cfa56fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "ee4c2a5db7d180995f920b30d0dffd14d3f99a42f4a70586767ca8eb919055f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c88de18a06779ad948bbb7401afe2d5d89395fd38d94ae81314a3eb8d145c162"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "64255f3bdda4387d3a8cf90ddfb2295126457919c3813f8ec50f23b72e9ca55e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "940420cb041b33b8bbb5dec9475e3c0e508aba8229cd5b44710218f2c3eb3108"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "5b1eb920d08199a3f7399e591108d212c2540c3b2702635a496cedf32a31bc60"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a78548d076e1b3cc65113d5aaf964cc8f23b50fb5e9688132133381745c44908"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "fe401dfb33d463aa9a2ecc75596b1f5bfb3b0f79513277fa6948128e2b1e4a92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0babe1b811a1a99383d5ddfc17eb6a4adcb63e49b94d9594bfcede027f6508ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "be4e5ad103902adf2159d80954bf0bae6eb50b9482752767654329b7aac6c571"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f3fffc68f5b3cd445116d3ea8489f194d7672956488606e73685ccc885c51d77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c3c9a22313b0b5a602dff57aabb725096e2e567a52b5cedd25afcc1b03d39945"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "7935245023d556cef211cdb32942f1b445ade245490cda35b069aefeb336f711"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "72b9359dd2bb7c3b7689e2e59db671040f3c9acff90e85d3c00efc67e3527e4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3be7aec13caccb981a7604eb0d823f6dd07a32b98f30d278b77a611e39a9823b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "015dfddce9ec42b4c141c367d595b22e02c8ba688c5e46a3a1f85f7f1b1cdda5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6c96296d658edc3eeb49c0f34f1020cfc381c8f8e6f91e5b0c639fb567e85a21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ec1787bd8992aea3711c739f4f06e1b924778e2aacb4d86efa46e27970894056"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "cc3311ce70ac9a8f2875e40a47ea82a7e211ca16935e2dd03fc9e5ef7e17d1c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cd012db5345e1049bf3691139761d8788c6679f1794a7570213591f1f08d6d28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c2177126c20ea4e4d34954df09aebad221937e68fe494eefd6cc9330d833a37e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "6ca9336f711a0c9ffa3470f1e8a616b8c8605926d8c0b067a613528f631581da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a5382c6dd5af4bd699ae51add4181981dcec0d9a687f29911a18fc409798eede"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "de8b376bfe5db6c44eeb6ab704c2c0cbd4d4babc7b51ca043ca0bbc1469e3a50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "48e09d43403bd9a9d99086d6536dea68bc0dcc1340e0594c8239d6077113dbc7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8022058d4203df215ab855b036699a6d257d2748bde0cd09e24cb4639ba7fb6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "0849a1b35c5b79db662dcdd2e086fe8f68aac99ea30bcc4055f0eac3ac423db1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "855d7469ecf3d8544a81ed841fecad2ec07a1ce0844cfe222fd7fbe8ef2ee88c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"1756ba5f500bbd88ffec1b4fe137a985d5babe589b0ebb5ea5975ff728741e17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bea9ec573990d2af31a16322096c811828d25a535f822be1c991bbd7ab296832"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6bee798cdef48583a938908c92f671faf41bf8a50aee60fbdf781e2df2f9bad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a0e8021483714e90f84fa401889ba8314e4cdff2af153d0477e7d873ef8e1ea3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e947be86a1811273c46feb22b8d7d2b4c6f2e5aebe772091fc0507ee95bafee1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "118fccd90596855f59357a51572f0f6697ae74e5392442ded661234b5b520245"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a5d215e274e1d6302eda8bf8ca792d08dca3928f4bb7e55bed7545abe50997ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ae4bebedf96f02de9ea9290203f3ee998757a8c99f175a26760363e7e68d44fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "68a7e17ea2636d27af0c60ace1946af5ce0112957a8b61b81c409f8224faf6a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2af72280719e591e6391dbbd1813d0bd905b83fef1b220b1c2b236b3bcec114b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "55afed5f8c507ca230aacace0d09041e33de70a344085a46f0458acb31cb12a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "89a3f57a8d8a10cb4227fd7b7f74c0d52a89d8901c63fb357d1ea4289da9bb4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aded6a545e6899a0314b79ba5475f32c6e1fa14b202e06ee12ea98fe09d73daf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "949460a128048b904a7ed177cd6c1476bf4b7a28d6bc48c2a453a631ccd45f1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "a3d2202bab2a9f8a17b489f03f0abd2671340edaf6602f5fd03903cb99f4fafb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8933d734dd45626d7bafa6adbeca2e68e3f52c6e273e7336e30fc84fc426970d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "032137dab1f3c04b2ade15c7611b734428ea651fc4587e88625362322d3825ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "49cc0d06b1e9c857b1bdac65c7bfb676e97b45195a63ed00e0829fb6323d3f33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "796472e3d914aa96a078a742a1eced95f4f9267a35cde341f091c24171427a76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e8149e322cdd9d632e7ff73be53e81206aedb4309849fcf2469c485e140d7cfb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ee74ca8aaa7c0e5acac709271eb419b9c1e4c3dba0760efe18387ba74e32896"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "d53ba70fd08175d040642e20bf63a0ef3312acc35578e3e68c66fd662d787f85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a73a80cf90d3fc49282eee84486ff04f2364ee7ae73b5a0f30c9f37746a88cb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f3dda3ef367935c3869a4670b5f62196918474ac4f934f50f68457ce180cb6d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9b1e93566aabe50379b318a9b038766ee76dab3f7864a840d5f92106b0b7819d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3819c48577cca681b56c7489d70fe65355251c455a1408f2954c4941a2cbd301"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "1ce9d84993ec699a93fa050b539216d97754c91a9652a3c85e9b1e97e7653616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "055fe18b7bb84df255b2e7592d93755919e01ea0b6ce5a3086984d3e50016ace"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "462621fc38dddbe72c2e38f35bc5b1c55ec24be31652fc0017ebfa683d94c4fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "8d5f66cc69920fc8bc9d85fe1b8516770c20e6ea7ee53815146aec0bd4d3be7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2856aebf929f2e54de30f16feb1bc319b9646b2d7df38672474a74c49d916902"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "af4256b3f7e994fb00fd52117ad55393b4ab82b0cffb63faa597e6280e0c0348"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f7020e0ab6963764c0180f73fb062ad927cb49d92884d56b5a43e801ccc2cc4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9b6f34e93d1cd1525f170fc454562b20cf42b0e949da2a80cdb84c3f6eef2864"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8ce048d992e2009b7c678f94b8e4316e61d6b9ff1c25a69fa3fbc302e7376b1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6db610dd832e2aa005701d0e1e4d3803d1eafb589766481745b31fafeb902697"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "9d29e244391736636351b6c5366b9dd93ea4f9fa6cd3b55dd99b1d26d0fd362c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "377d35c51255b4e911bcfcd14b26232d1896cce5807f03a5cd9464b8e09a3d6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "41ad9448fccabfe9d4dd93f77fc8777ae7abfbb1294744f9f51c827f959c9456"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6d50dfc1a585935d7ac77e6b930503857213231a59eec09f895a0eb59630e2fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bbc5e76b1f9ce233c28a0bbc5c9456dbda4ecaa65401c45e8c1c83ba8bab558a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "14117113ed4b27a0bddbcaffc5f882b3b0642ff57e53ce26081cf3e85f67a148"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b9f9fb79b976a8b91e86df2c518a24dbb7ba3bc79ed1b2332cc30defed5bf315"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4d0a6bf90ea174f741c533410a333305e1a52e95b75d191be32f1fbe4f10cfec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"98642bf620216649b4193f84f75c8744070ad272fe2118b5049fd00b840bc7e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "73d074ae07a6bbc9762bd639daf16105354e204925376c295e4ec4650fa85238"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "c2fcdc53f3fdfd269ce1217f75699ee74d9c6c01558717a180dbf2a62a8bb2b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "6531d90fad32400258f32ed44abec7c092a688c640667e24539e9603067d452b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "61e03599984bad9ad30e06c2393e3e55e3f2def5533da56d953fe580048aef18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "94ddef50bb325638b3208a6841cd9de80ce2cd1e2a192d85eb540899a5ee476f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4525acd5ce4b73a8d8e41b46f4250fef31169f2cb10fd6025997d6b75c55879a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"f57407992a4719c12765d60ffc815c1df053a8283736aa2fadcd944da96004be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "31f071a398507c7c50b5ae232754ac9e5204ce05295de215db8fc1d79d17b485"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "345fe477c7ee9f6beb38d25ee1819972dfebce062d1099e7a965ed7aae310618"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1b4ad4ccb485346340f67f266805900f098d2e5b0d7755aa224ca1b3d08557b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4eef1a499b2f6664ca7a26aea398ebfe027812658a98f97102a8dfd2a6607e89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7d5ec1e0ddb3d6eb04d908c9659ef2fe6a24506ac6c41e1d3bb9f4b8123c529e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ff19cb56456cb6cec656c9196bbde6c5ce17ba956cdc34bc327d6ff121471caf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "46f7caf32138c892f3e99861ac9bda7408875938504e9f884aff256380471bbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "cc18d570f7f57cd554006373257cf4fd48c99de3f89c9fc70b22aa36613c70e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "23244f7498c6ebda5578186c5e815edcfcb99f4ddd79335cb9d2eb3312c8e460"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f8947aa39fcc5a7bb18b07942b68d797ce32634a4acfa801263e0b0f5703f174"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "71ee9884dae947572b9a3448afd6a7e168755fef2fbae250115df2ceba7d20f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bd7862c97045bfd2c5f96b2137c14cc8bf7151a77459e55b579e0d7a606411fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0fd8dbe68f94eb7e7977d8bd393133719ce60e57f2ac36335372521f296334d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "89a245025b50028f9dcc49fa2b5870a5b07d2299e0bad0de77d1dbbf96e15388"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e42d6b67cb85d7779447e855b06be83e4d7fac77f8c43f8e9347a9af512c2850"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00dde59abcb5c465274bdd963b1df17c68e446c618145cabc530036edfdd92ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "510e65d3531975e1d473ea15fc1b505ac2121b22c414a08b59d4b9d50ad7287a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, 
true, false, false, false, false, "d242f15ed100f8bb4dbd2ea66c9e3371002eb01d839f593753ee1ec50e6cb70b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "ce7b918772b3eb82cb45904b920a5ebbea79c5a1b182b3a5f72a10f23efdd8df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "7a364fbb089ceb5c47fe83a0409034d364ac3e700bc5e8936ebeafc3c0e0d06b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c62e39a96f13cdea43435b844f2541b5fb82604e594aff25a4634612bdd6220f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0a5a43237f5168406485f3c95cfeefe69b4e6206fe6b36066c64215a34290398"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0230464448474a29c6e93f98dbbacd49142dcde6414402b786c188d9406d47a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "e292227696768b4aab9fea506eff6b4e9e09b87a686f68ea5f078d3e68c78d25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4700f2776bc75452cf92da563607eeda69370ef72ceb49f7d0fa2b86063b75f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "349e7f9acef057d42d3776b29ca8b49892ec84ae870b0fadc9866861aa135652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "cf325e7cfec10c5aec9b12602a22d5a46370239e97b600292aa05c7f5f547df9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "0b824bf17cb880ea604b2630a34822b02222ce716796a8fb94aa4ea2e1f0a605"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "6f430571933628923e12739685e8f06aca18b38cbd2daa9ad3fb0a7734c7bc5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "1e20c1c5203293555ce11750f232697b85c3978f188d155d663b6e3c6bad2237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a3f949c8ec3e0b6e773eb4295af04d7db591b70cbc76a5113b4ad74a70484133"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "52481596e9948fbf071027ba2d4694f603e1a0c4ad96193cbf278d6aca4edb7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "aea0929fb09fa413e6e37053118a36add1529870fc843a6524220430d32e8fc7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "1c78225692b91d45f5281c58e82c13fa2149cfac823bff38ebb504c82c09ad9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "bce54979a9d8612d3923fe5782b8ee2e3cfffc9fa66d89b423d55db22b314fda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0cfc910ab0d3af4d657ce3c8ba0ef1f1a5956c0fc4d4bed453f898953410e37e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "28556b06ab0999b05311b6d0674aef1a1e5b722996b31199a5b9485426f51ff2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9d79af6f98facb7ed5171568514fc6448647f2b096502badd93e9dd46e5dd5ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6b2043e95cd4a0853ed06bc7bb6bc9997d0c09e0d6d2390a94d602c584444815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "ccf43f539cc2826683ba0f8ae9fa4036aadc8f3b1d3e22034d1896aaa332441f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "52254cc1ee1103e2ec2a59cdc3ec0d70b522b8f0c00a8bdaa7e24a63400ed2d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "67771d9f303c7b9d94c82e58945debb950bbba11f3a978d99558e0c36ac5b58f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c66ca2d155bd1c2091665e66e0334d9d2e932c3754810b32678adc70142e967f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "014b13cc366683a375bf17be377fb46708d78b70679b356135c64533697543ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "12558c3daa065849d61cd60a898ec628d6f534d036e3bf4ab8e7332e3b3bee91"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "716c776dd351cd67b5e8ee25b897db14ddd049edeffc57f65a72914313a54518"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b9280b64f2e55ad780e7f6979ef331c95bfcd0d8a812e7e5ac6fbecd95ab9936"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "040e2c8ac1e981f10c54e98ec5e2bd3f7ea7a71083d099bdac0f025fe084c7d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "57f4297e9120fca30b5ca9ecdd76f47c1907f4aaf0507389f1efe1904dc79af0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d34d219c69755802658db545ae4a246f09c38871d97f9a3f586df725d786d1c7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1cedd3dc32b9d42a657467d512e7a29bb6e599b740a97ecd2f29e9abe2997359"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ed9b52f4942f43a3c60a07ec0fe8698198f19d3b490ce8a0cd9ef0346e3339c1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a4b50e8d74825a91181ad417fe7ac7744329f900d2dff4663956d727c8717cce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4006f3bd0a67f75166bc40333942bd4b4717e81f26675fbaf979291105af2319"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f6f5134559f839f530b4fb2fe079d50cb9bb88f929999a807f40f690e2a66190"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "46d2b5aa49e60d16806e399a56c80fc91dfcd8459cb1c8aa2d472e77a221938c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cc4f3fafd54e0ca939b233248d74df9cf27153197569de6b8d3b7be5a4b5c15f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a4af3ea5361709d45d3cd294dfd7c313de47a717d35477c9a6364b1c6f992ffa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6a10b0029052906ae855d3d09d35bfa1939a23894d122aa6d9484d806a85428f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d083d591ce9b729bee834fc637448e05ce33d68d8e53db10f81747f0c7be0f46"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "06c042c3805b15e7eb2fccdd2eeb00c0e94fe01e7f03afbae1368fc9b3b3149a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a3fe11450ee91f2e798fd220373505ab18b599c1dc27416affaada367a706e04"}, +{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "13329c46252e533875adb99f3309c2d360ee7e8fdaecd9892990043fda98b4ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43ea5d8da8f19ec43e0b16a8c44600a90460d42300bc978608f68c61966f6f42"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "445f4bcb0fe8e34b7a95b131da7cfb87a19558ea57cc468095bc18c2cf641abd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4fa309bd092455b8550df9aab6162511772e6c6bd409d714e80541ea0fcd41cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a0cb0e4b3c8c9271f1c090e9fad55d324bd1a026e379bf1745c8f2f51d859fd3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c84b757e9034e6271473b3bd4196d1662367f363e897f0a8b76bdd98c7a9ed5d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"95d8a37366006f476ac95e0836f64b81a3b6926731eee58cd1ddae0b3c72ec27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8d6cd5089633c70232be560bafaacb16153abe7040c0075ea81bb3b0a6f26d03"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "881506d46e54835da8830ea8a712769df75858c759c4a844eb6c7e4338d5af33"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "36524d9972327d5a124283a3a51c5f77aaae1530fe4d092dc1798041e50ee168"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3f0a1d030fe6e3aa32613e2f3dcd07fd64cd05fba1a8ee93a802322536bd4ed1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a85aa00ecfde60adcfe35ce06c427e3a0db15bc443e51ed5fd3539732c720984"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "da91934a0388a3f1406c7b075737dc39bb5a2f49bac495d091125d016548f37f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "24fa1e92034a2468752c5935d9b52f3c2af4b969872c2767057ea77d24b7d289"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c680ebc342e2df45046f62ff16ef5c635b9681a8c403ea1116d8269d2dbfed99"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b0dd15f59c07fece4627f2229549d2fb35028b8ca7eb9b7d8bb6399ebfa09a3e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4cbc81c86a1519fbd3205d3a504b08fac1e160983c7f3126138c0969ef0cf85e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b030f7cfa4e5b5ddd56792a55ef9aafeb617a7c2a255e153113a37eced0604a1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3f73a14efc80a4441bb19f5471f1493596fd323d9e75f53ca4e61a34fc770d83"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "198aa44e75aca4b16132987d738f3850df8cb9adcd3c3e60ded8a17d7062e576"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "87f7f5c4610b238821b61add561b8625563b909c806b5c17e67230d703dff383"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bad18d25243cd0286e347d51746538e444cde0d12cf177b79f87bcdc4a680a75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b3a9901f287b83f10404c576f143a9f7a251d056454fb077f459364635fb90cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c39d2cd832260d0ce574984be35b154b13df69e0b9ca85c50443fe719f9e1b8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a4e6ce0fbad7a144c8fa78ac1cf9978312984991bc5df7cea9b4083107485b5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15244219de2f5cca03e87abfc861de6c63be49e92d2eb49bc2bc270ba1fe1780"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2735ad7c58ac4aef1c3f459f88b8c3d79996599f7daf1551d04e548b1b06770b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"8ce83337380eb157bc55d5ab7d915ceaad8b211b17f4af0dd662d92b50c40a0d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c0634bb1b441737f25d7db8a2cdf09b55a12e24b662c3a528881c12cefc0a137"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a26f09c210ed88dbeaa5487fea765facb14635b5288ee540eb5c01f98b3d542e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d74dfd318af2fbaf8a1d38587adeec5c12e2db414f47242d5dc7b55bb4374c01"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa81acd58bdcee09879d087c611adf6cefdddb3cd596af23ddd1af404af322fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "98f7c8241bb482efe3413cdc43d5d4a7395ba36584562a250528da8360275fa7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e1b7b734ce8e94e21d8bfa178f8953b2877ed3b7eb86e4623c89343f45393bbe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a59c5cc4d37c604272408507589aac5d5d59cd188a37012bb781d91188334283"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b03fb27c63b31d60870659412629465ab3bfebaf983f354c2899d09667aa8c5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c27e514d9a44784097b8108a279c251d874bfff2abbc98b7978a4be4ee6c8299"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7bf0aaede18adec4712c42a91d224855ed297334372d18224b19f9d74dbcbfb6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dabe8e4c526b8ac0b729bd55319bfb34bf928b03a707b10d638a533681e9ec8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d4de13865b118297ec8e7d84d7bd0899ff80f9b496ba2762f629f39227112f57"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "78f137c628c5f414067c0c54aeb5ad8243c932a045b22460a0a552dae4d802e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d47c1a40d3bcf43c70efdf3702b300011aa072a4fdca349b5010feed111a28ff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3ff85b44bc302cd7067c77f5bc350b6b13d20bff6ed655797fce1db15992470f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5e5e875b7fd01cf4c80ce29ac98ca809466ae2a83b3e318731b9588587d4ba06"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, 
false, false, "631c9bb838041168a42a55629314c2d8c5b0134ceaef9adb8105aa3f5012b4ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5420111e9f1aa3e0595e961f2ed081fe2cc70d0b525fd3385db90270a9bd082f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "121944900920259596d609a973bdd3c0c79abef257baf9af0d098abebea56711"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0cdda369e15567f9ef1877d647e27714a4fcab009157ee0c8353ef05095acf42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "13b06c84b209670bb3af33e244f46275b43b57eaf2bce0742b6e47bcfeb830c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c288821a85057b1d2b364ed3527b68b9f6be71c086bc7e686168e2ad820bc8f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "537b8a469559a2c1afc7454dbedde112d028e49d5cf5e7cdf83317149008bc7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "df7fbd32de4af4c68f67f37fd87882cf1b9938e1c4183859a00bf244050c29fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "737dbbe0645bc56e16e9eb5fc3f65c42c176e0fcde498eff9ac596f4aebe6c47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7712f18341d5ea1fab0fd28faede4c9560050cc5cbeb8421572afb62e1fc437b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6157fc8e64bb85920b253c879019abf3666213468cbf1c3043e3dd8943ff92d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ebea5b4ac5019b13ee4c36921a13d0eed1c31841aebfe3300858b90569a7d95a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b4ca011284e7ab77fd8e2ef40f7693de2a4b00ba4e904df5b4d6896dd84d0005"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d6dc9669642e94ea1ce4a80030a077b4e2554498b123e43718e1888f6626fb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f5a1cc1e0d8f87f4d3549a3987edb1f85be0208791b232c9858c8bc2a7d800fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e288812319a9ca4f454272881cdb8a5626e341b7a90b81d9360ed54a95608861"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b4e936f2945af2398092666fd58944ecc5d73aeb49082aae34fd9ef20f1d3829"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "fe3c2432eb17fdb3ff26f6d5126b9701047b5fe31609c3b467c69d0d73bcde9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, 
false, false, false, false, false, "492fe934395225da7186a216574bcfdb8f2aedc0bfaf26e7ef46942097917a61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "08d311d42ca8688ec3c65d21cd0efa59580663ee4bbf5c0451aaad166b646831"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9088d5533db946d8f4b433a46f30d6c43b63521c51780289c72c0ba7fafec120"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3d55e4dfbdb3d41cf8651abdfd4d2832bbb6c4e979ec09b061c108b4bd5b3b3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0ee21d0fccc92f1209ca66ac0b0f9d582b88a2fb97e8d31f28b0581879feff01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "167f3ef37520f65af446204308256fb6b0f8b82c6d83668512750542afc1c993"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "253763dfb4765f1d726c36ad781999a90648e76bae456cb7b5ac9d0a7696dff7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"a819f0e12dd79273c8bfb00a478da5b8efb7212351285a4492261881349c12f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "eebed9c4d22c19304d08e43867582019f4a7fb7c58a1eabb094a8ee56884f7eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "f300269080d4d6cfebd422fa5b97585960511c59c1545c142c616a54a44aeb22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1d972296796493ced2fd305d76665ef6601753811316b5d102642d71b3cf8cec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "96541023a5fed4dc06c4549d929fdc88eb7b1b1f6eb68b2a07b04fbabb14ecfd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7b07578c415df455fb0da9a68ef3f23ea852dd861cdc06a4ecaba55daa0ed074"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ee940b385c2ca63ed0f7ee140a08cb73c9666730bf1ca737616982f82d751e7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ac208e7844180dde6b807c26ea8a9dd52cc4c924763f3ca2a08ed5e09f26d9bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1a59a085dc1ec225a9b6f1763c519655d60007e4e9bdfa4603283b9e2ed3a624"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a433a72e4438fc7b458b3af2e5916d662c3b6adeab6abaedd79263b9131b81bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f6471797e7a75fd6e8d1c36b60d76eb6ba2b4ced9f919c429eec7ddf9544bec4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6428b72f4c0c9482ffcb0fb184909a0bdc613c85180a0548ce133c80ef581ac8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e778161256ba40452f8bd567a465179f6a511fcabef304322be59c85440880b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ea5366e15c59714aca422b674fd49c8569c8b03ba9ab4577df431642707c5b52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"c908a685486a9d791b477827c2e7334dce598e6ac9c6ea40ca6243620c84e260"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "af8cdbe85e315965bcd1db8b9f4f0b6426a6f4e46ad27e9d80d70e1099046f2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f31b9c95825ec9fadbeb89da139418e6e2bcfd5e19b9dc8b75b603e3bd917155"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bcc03ca3b419b03d4e06a66814a348f37f5b90906da66754c6ef371ff590c9c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8d928dfade95201ce61515eec612bdff1826310b7cf71b9506497c9c0890cda0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "5eac742cfefa4e02ce5212c1ce6bab5417981b0aad034ebfd30049e73fa619b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7cb95117227c051fc01b19c4f48dba79a885407d64206ec5e1fb3f62c9c45e55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "11acddbbe324c743af2cc13dd66ec604747f5f64057b8cd43bbd17cba4b4ad7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "43a74d6be46b4298db6fcde5fba765171c25fcf66dc98762d5470f77c78c3b08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3e4e8c9c305412cb199e29e3e65195ccec27dbe6e9e18c4091472679da740849"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a475eb6369d32da8f8b71078d2c973c4d4f8029f5c0d9e51b34af47acd8ffa34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"8444b4640f42c013d9bb663d8cfc4c0a0c3a6b4c1118f3273c49a6745dc66608"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b8bb328a95951ebc52bb0bcef67cf86484720447df3b463eb74205ad96ea963"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "48e1ea6c085a91f5caf238fb0c403b3891e78e4f504c814fd903ca2cba33f363"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f7e2955b61353e2fbed671f6b4c8adc78243670bb5ec67e0d0b8c8028a1c525a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cb875c4affa429825da4b5b8570b6ac042760a4321be5f5ecad7cc7d3b931116"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b1b600e35d24f05771c81826d6871e555467d33f387174c0574aa9e491be6e21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4b08b3991644ec7db1a43abd5c3c975ff845a8e2a0f6c4aeb4637ee4a3db171f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ae6782f9943b35917f22180aeae819b3a4f5bf6784950286fa1f4d1b466dcd34"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f7a9bc767bfff75aef6b951c4ec8e04eaba77f48c4442cfd9499d5f78de756de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "39f928dc4c37b841c6d8a9a80f44c0443ac44964e93a7e1016391ba72c81bfdb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "57b6eeee22e922b80230975826bfecb2cea9821f6a092589aab1b7b6ab3f53cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9fa652572ddbe1edeea3d9327677d9b4ed7e05f9df9fc355c0345124c109624f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "da140e10a03b9402b857e5800bf8225d142ad73992917e1e59ba49073b967264"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "74caa71499c7c15c0e2458a48a81ff99dc9312c25b1c3454acb6073f6541f813"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "cd7d268b0ee3f7a9f76853edccab3e3e74e928e475eb4241b2e40ec29cde6f41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eadabe30378645f3742374d372a90926e5905fd47399fa1a3d3eef3e4661b7f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "07b9feac7bbf583d0b426bf2ba2498b6a5bfc591f18a4f6c922cd61d92056d7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "330dbe5dc033571716d9a9f1872e6288070170eeefad12039138551f31be548c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f8d0fafbbacab59cf32534b0149ccb18b430d8da5090b236293aa7c3006bd624"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1342170751295c207452676761be0d714853e5f27f422c6b3d0482407681a0f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4b3b1f62d70c79cebcfd24b109ae408149fcd76b6159c789afa53dd4a486378e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7680cc588b24e02ad4fe238b9ab2f33510fd3d8d917fbc3ad7e8accd7401f292"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8cf6e91f92fc8c13eb04f006f24e51d5ec8034ad05da282cd1a48477c508641c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "60ca9b1342c5f1b0a74254d4c9121704074772ee1a897eaad3d3a222daf4b712"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6c0009b007bb772719bfc54f31e99a8e31c87f8ee67229ad15608d19381deb7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, 
false, false, false, "362d4336ef7275fc33aad7bdb8a691ed9be0dfdac6b0026bc81937ce23479ede"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2f2b7ce806bb5e23816be29897a4aa9707b7330ecad0732580a0cc7615a6c1f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "eaa842ff70eef179819f91ebd1004b1dfe3e4d50c0aeb84a80c5a46f5e216f9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6a6a86c1dc821a58934b0b254a0b5f711f2ab4b56f01db089243c7c52cec75f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "18435952674587abee19585f09c59256b31e4282b77147348b73666a7abd47fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "72918d90cd959d9e59be4cada6f4909897182b6161b564ef5d5292e00ae6f2e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "595528032cde1ca2e194e175e29bebd16497bd2e229cdb50a5712ee71afdf1c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5f757c2ddb5251dcc9e11b142fcdfd9674716a9c534e046b2f2d07c4038eb1fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "774d8632cf2eef441e8ded93f65fc891fd97e715d806202f59f05bbfe32dc356"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d30920dc5fd1c81ed0949f2c903bb214475d83ac62bcb57cfd717b605787c430"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "3dc3e2c25339fb2cee1c5645c4b0e6cd913bfb8f04cba10233c59cd172a22d58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"f1a33d8b27f2ffb8f4f25dfa06b21307bdaa57c85ac1998bca9c028caa3007f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2f8a31ce3cbf9caee0cd240f603e1d1b0c79362158924285c4937a1457345ac6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3bc3595e355277ac97ea7a8df9567fb1672e45ef270c3a86d4feb807f755800c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "70dcbcd549b5440d1e2d282fe59ef848a231da75660bf1f32d61132aa245bb79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "16db0728bfa442868fce3f5f97ccfb66173e63be52efe6305e7ff940736f8604"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a6d6ed2b82ec9dd4f7d0e42cde476e6231a56c82db7db7d175bec7045a23702e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0c5615e4150ba1e964dfe6f49a1169bed5da82c85de067efce7ca441b5af46ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e44c6135f8e9fc90778b3cb67f8727ef4a2cc112403a86e3113d45b11936a779"}, +{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eb773c9d961383416a5ead0154c113db66221a41d51b65ceacca303c71b4c49f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1a02ee66d97d72c9339342b44dca24f173f9b189eefa7f0d10ab9424669b7bec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9f5bf6589ed80212fbf34c73fbd7d0ef3e72951b6e455984fdddcec5d37e18dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6bfc939ca58e12338005e087fd0a9d4dc235440cd767020b5bae339834d5b371"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1a55293ed2a7a4405527cdcce06fae8f822304e96313aee4272e02381bdb86a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "42f9721e1b08cd92a957a92fdafd0e832ca9b998d4502edb3d3bca44de1a0e46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c51c609fba31f3d92496eb249effc35bcadb0a9841dd05296349ce2f23445cac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "36da0a4f5330b4d701d0fd228d15493d4733f16d25abb93467617a653b2df913"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "17068229ee82f4432f966772141ca57a4f3b744c50a869dcccc793b5a8689803"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d7393e386cfd961f392817cb2eaf3ae47f8122aa5065e6a133d502b016619a85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"e4596a023f1381d0ce9321074d514d4eae3ce45163712db69fe97e191d4e02e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3d68ee08f6351d2c06fe006cac4ce856d2354d418bb805ebeef96765f3d54162"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6f0ff7e6708436d39d49428a4e206c25d86699123dfdd4dd7d7822363d41a017"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3e68a25fbb94f2552de54c76e356397c02ed4daab8fb86543537d50eb884180e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5e791b1cb2e732b78dfa3f0fa9e4adcf1320f0bf657aba98ab0b8253a732a5f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d7c58803f0b963f2831a8bef361ff879d6c1cb479bb6185ac7b5bda5e6f73f23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "88a67a5e5d85be21cfbf9b1881e8c3a6a457579a395a8ce3532d015f300f897c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3db73a8aa895125ba9cf83b6ef9305a60c5c9b0a1ad59c373796e46399c2f405"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "102abe95adef57f01a67cdee14b4f4eebb877c60ae10ec5014b7e4aeb9199b61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2fdcc196a2489c74a17daa8294486cd0da319b774a61ed9433ddc2d5ddf833ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "81eaa75bb9bb080d61196a70fed69917475af3ddee071023d99c86473ab357f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 
512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c9c4ce0e4126a01dc20c16d31658c4e4714a4eb61f32d6fab5ce9f224bdd91d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5a65fd4b054b11627490ae2985eb4e763c825e82b63cd6276efeae8e2b0efad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a6ee2eb312fff5abc573bc19c174ff29546d3f2b217eb33846049ff9df34038c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ff75a4551de29d030044053dfe1288a3e198828165d69b5d7d8d1d55be2e4e35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4ba37aa07b0456238de4a86bff379c1584bacdc02d27c41b3206e180218bd339"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e77deec8eb06f85de0b5db21906a0f2fcd90256c85e5c9d45596eaca90513c6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "411b60e813b8a79d76797e93793ebd18cda3428cf629063f0e4bb9dc7499fe61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1db6243ea71827991bb7d34d5d86e2341945c1b8564e21da420022b098e2e6cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cdec7e69b607b1a25a4bd6a9e7fa1a6d203a9a3d154cda37ab9bfc22cafa65ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ed34d3212498fca95d27bc0cda42992cae8f2abb675753ccde02d7f2ae8dc2ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3abbf726eb5117db4c5b52f4cc4b493d7ff8d5c4f948131572794db823effb6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"d90e95180470b75285c9cb087836047d6681cb454b5c2e03b222180c2d2b4a71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "75493986019d008d8e8f9343ef43eaa5eede07e070f9f93e76cdb993cd991c05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4d3d0a33384f530bfba85f90ff6ed1ac452e10761327416010123c9a0ac06878"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "28e0723d861786e5d2888f2d9055e7ab166e5de1f4aecac23f535549ec0decfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b638097b42a40708fd3c7ea3d1cf96c87c7bcad7d6485d11c3c8063e79c4c0cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9d5b1448f19cf1e5c68c8cf4e55831a32e0bf9f9fc540c46420abe9acffbd600"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "54786813ca6f740c024314c83c9b77d7312ae5ece94b6835fa4effe59cc848b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7d567923445fdd3db80093996cd9c846fb692f01518c1e85a76305729a80ca28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0f9a9fd3a52c47d7d13cabcdebf1db82838288a1afa44726631597385c0192a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0772e5b253f1fc8023b3e26b5cbb7ee6838fc81348ea3ae61623d7dbce17baa4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9ce65783a07e06a8fc317fdea6afc3c27bc86980107b29466df1b7e179341cdb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"d3d3001cd001ecf36eb0afa7661ddad4bd14bcd0dcd8f8f632ab5c5b79b319a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5202c7126862721072baa062c66dcba39b9f4088d4add717243dd9df902deea4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ae839b3bffeec2e705bc7c2072fce54fbb7ec5581c28ea5e7cca4a7f8ddd6b3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "56acc7ff5dc29840040768b653bcafac1b17ac68a02b728de6eae2719e352b9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "52ca49ea887918e26dfc4331d27302383669344d8591ca30626edbac65399cd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7a0b11902d78e200c2dd06255da60dfd0503a920bad0a1c46a8b4fbf218905a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e2b72e5bf775999706403fbecd97196de700bc292e419bfce2ef8a2d2b710564"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "112c5e9429c368a5bc50047b60620bc5a39cd54697588917e183dd2019206fab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3a68615a0029a31405d5a75481e3fcb253a7dd089b2f7410541a79f9ca543131"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e346e17bcaee3f7f0f94959d6a46e81aebb77da1911e231faa28c9c447cc15fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b5f999df57c3d73ed990f67f0e5ecf8e76cbb8bfea0dfab64e27ed220f7a3541"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 
1, 0, false, false, false, false, false, "18aa8c12b11ccd14aed5888c9ea3850b051a6e388f0afb29e2876e3e8f73f565"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7dfcf17c3908460075d0b980068c3bbc7ae06642c46d065d1184cdc83a467153"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "53a5d78eecc441e82ef316c41b3849f56ac7ef41eeb8eee7a94c38f02deaf558"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a174c0bf78dd865b1f8541e2084e22de3dec004c3e298b09be1745049f8c1e95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "164d6d374ad56930b2c7e51d5c87a1a9be2aa9023d821a193cc13c010f8b5a3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "813981e9fded6ff83642ec9dccea43d360aaaf0c7c7c473aa8106cf31dc3377e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d39d804faaebddae25058d77f82bf1bef986b3b478a1be38acb378e4aff9dd9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a780faac885b0492839bd839c0067256fdc9c228e5fe5896eb4f67f17c212509"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "781b344d97dc47bcf826074834d68ca635327d3ff2b94cd7ca29c492ceedfd9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f51dfdef619e91b1f9fb21da28013a867639b3dde67b753bf1821057b4dc5043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "20c1d411aef60f15fd91b53320b3e766049c921660cf05c61b5616f8744db74d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"3ffc40f1d83dfcd0a72bf854e1771cfe4b1cbef78c43d43fdd2686efec56d76d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "aa2e08f46d094a31ed043d117d024742b1f38de33f39876d6745ae6cfb9fe5a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3472e7d4bc8660d0c128ad5c28e78339cb1f14d33c1509e6e30e45930021f197"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "80e06eddff20b938ef26570a4bcb21d6160e91cd3c6176df40438d4943eb0810"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5c3bb1c45dde92c0a9d6ebbf377ea262c261f9a6d8858ca3b8248c1e0f2c18f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ff49c8a47ab9aa58c90000d07ba380d15253b3c37f890175ccdcf84a7dc6aa3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2b56f18ac2b488e777f0e09c03d30f8eb2d0432932d5cf4fdca333558e9517c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"531ee87a1c419a3454190d731d2d3034585b8677560c2b3614af19d2369a2401"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6f72f17798c3ad1ee4d343a51f54f4ab0f9487417849b0ff4513611161fa856d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "daac2613695bd70e474c651bc0b87248684c5ddc8002d289fc2f3f2b71589e4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3d3fd856c651a5f3dc9346f99a2ec205b2e17a9ee2c5dcf94530609b35b05eb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e9d83d6f8cd07ddd367d379c06ab11ec3fa69a7f804287a269ca8f932b59b6f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4ae7ba726c2e0f78a75db067110863058e254559744fbcb3771f4a90c00c44a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ee8807728c4b6efddbf344d190165784e98e12afaca705656f951968051f5980"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0740c7143c47b566f89587ea2ffb14d23b8b595b82e5fa85fd32bb37b683e10e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ba01bc3f62866b8ba40b6b5bc2cb7042fbd7018d209247b755155458f10f8996"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e3f4967720b80a18d32523104a5ab54e86135d4356bbf0b7a1f5d61ea33ce960"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b7a45f50cc048c1ce639d1d06c0e096d2969168cd93806639984ff882c3a0ca4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9858a705cc430a34db49f9a593918f215ed160c1e0316ba64e6a3c3409afd4f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6340d049910b4c39458f3f019ea2796dae209517f562163a87d3619d7017445e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "097cc6c3e46827fa777095637c92e8eaed26e2ac6d89c767812d8260bcf058fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "23fd5101472b892c202027ffd896df1a64e376c0456837368fc76b0cc3f58f31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "7108077cdae3a4c263cb9984bb2fe6faa17211494c522379d908c9b9e6650bfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "faa3eae314c4bf020f5cae4d98e1f9403b933bbf5a058c4d149e99d550a7500a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e9508e7682576cbb4ae4c06ccca72984aa829a2b9e160131aa41e5e276f197de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b74c964549c41d7badf3bc11fa523763f2066cc2c1649376eafdb7f46b116c5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "bf25d69a6789c672b10096a99174aa8f7d569c1a3a7a14dd1c21b6ec3355ff82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f355d0aed052d4170fdf1afa8bf574e062c908a423952491db408f48ebeac14e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "434481d41a2bed995f53e61a919eb6cc82096fc8d4ba3cd8224a986900c94d4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, 
false, false, false, false, "ca0221d7444cf69625f3949395c7d27b499cf9825e07e0076a662ed0e2e0afeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "01db214dec9d7c49b266c7dbbbf58081eddd599c33eab19a25327c01f460a9ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "cf3424c22d9a358c5ebc8c4edb3bcafc2f1bf42a6eca9373cb20ba46958ad881"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eac6933e97f38df0f7d111d037452b7ee9c3760adfa6585a71f4b67b224eaa47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "4cc06179cbda999bfb9df391882a271f8288df068428802dc9cc541f482433d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9a5afab1825b9dcf1a8ca27a8cad2f7084bd982965e2b4d03eef2e84b5468504"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f7de9a620ed44170f9836167ee8ad946d17c753c373f00056d6af421cc12a0e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "eab06926da0b1fdaccfe472247cb0c1e299dc76e9a46bd54825a64f4c10e713d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1becd42ce925ff48652f70395f5a4376f06f7484a765ee25631976c7c871628b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "232072c2e65c3f9eb3ca23799f53bf8aaa6fbcb96fe5c4ac8ba052582f51c8e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e494709f66505b151baad12b5373fc6e4190ec8650323a46354876d48cbfff5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c7c75b75f3593c90e81bd9c5b99d8ff49c614ecec6f90245d4704f22c844f5a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8a84e00c424ee0dfe0ba6364f040b289b2e49cb6f778be03edf6d4cd515c9488"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d34a1595ffadecd1c16e5ef07867f63a265d0a49ae9ec3027b70eec9459f849b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ecf448428bc76541a4ec21c0163784e202487af8e9f0902b78835ae301cd3610"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "944d38233093695d64cbb823bf88de167ee35cb9292b9c0490d320ef267ce65e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0dfd806794fbf6571225fba3c80ae71755cabf05c3e70cd6fcba02dc3ef21422"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "36ce88f81b0e051dbb2746ca0e5f778decfa43d1905ab6738e0473b9ebaa1ee9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "a7d8d5a69f59059dc09c6154e78b8ce46dd5502b0c995d26ac21002301f664c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "50ad5a5e20060a21ea8eedc77fe60548a818bc6b67816545baa2c254cf657053"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "22c11248c4e4839dc7d287a24f3254401d2dc08e260e964be7fcf81de32780f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d77bec9b9cdb03b2e4fc1f05237fec143a535415e2c5c9bba17b1de87cc322a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "95d08a8cd138cff5b4831ea5d6f28a831fdea9d6a926043f33063f5c0ab5aeec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "531100b98240bf30ae8a2c1c466dbe32f5ae2d3f7141260069586f52f7c5fa8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1f93e05c0396501a4d5565b6606e184554a135de7834c4327dd9f648ea188a0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "46ba688bfdcb74ddd04d6d45ea6db8b183329a43a1409243dad92f8bb53cb89d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2aee39fbf1a3df6b2eed385fff8654d33d85e4f48b75bfe303d405c846d1f30f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "27c5e5f634b99655584f0157d7c7fdc24f9b4a01012514d13ae1864be70cd515"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "39b1cb1f2c57c58f99b8cb7489161b4b9f5ee808fa0294144d341c6be021e41c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "53e00061d67abbf692ba40d85bab08f7faa6316b75db2ea599cdbd5b4f074e1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7f789eef79bde3dac5484f7d9f7f41694dccf11a67176123d05425582b25b9e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1d0713f0dae933db052fde5978c4a67545df2a3d370056ce0abe7132b2f67655"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7b8fe0a585cbab663ac7b892e47bec97fcea8b83e1cf926b2ab58299d6a0ae9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0278376052781aec9a74965c560cc41fcb8aecd361963ea6d16e6f3db5675372"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "63a9b58843c29a7d7e78e7d0ba71dd1833a34fb78624f295e6423a9f37098fcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d918f91222dcd5695ff294718738e3564eb9a1dbc2d991cccf7af7bbd6922deb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d40dca11400719eba449d6579b7a9fe5634c8ed103e5fd68ad24451b0592cef0"}, +{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ee5dbdee555f5a17f31997cf9e7eed4e76243699105c19313f20d69ed78ba744"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "46061887c40ff451926d7b1144af2f455f05b3bd7994b6a502e1fe9419438fe0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2bc546fa4fed9d55e0603a5406b67a2fa551a359c144c8df38250c22d257ab1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"261efcffd97b7a9c84ba230b48ef4039992b0c7a48c2e64faa662d9adc0055e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aca44f0a0fd603bccb3ffe69d9b98fdefd7420e9d223d1f6dce445d6b77beac5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1b427e2859949bc2dfa2a92d2c08f122b6b69d6657e15c22635cad7e45e1a8a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b160dfa651387e36322a5d7f71aabc18604a56533b50201af30a70b9ffc35fcb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "56578513917bd35d46a44ad40014a145856b964aaf59e49c8d0df03ed61dcfc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "79fcbb2ba6aedcdda310f0ace9becc8e5905b2f12019cf19b3803e3bd3ac6af7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6427d8984d1755da5064824ff8cda1bbf2c8b65c50749af8cd2c1e77aa295ff4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "affc24529499bbda4d4e2d69fb6db7116ab371d4202f1d8a18fe8ad9a487e4f7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "55e1e872f43f6d751ac222d12a3ebcedd4a2fc8cbaefd5b341406e031c5ae2a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "27ed0b24ea29e20c794800735ab19a662334a99071a3759b5a083c014f9f4232"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4f5395c2ca8480f3c1ce50fbc28a30da662d7b1aa4a9ef4d33793aeaf9d49ca4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"b5ab6c8e741423afe5d48a7cb8f977a4445faad9be6bdc25759be9a3dcc006fc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c8f1d03838b6ba2ae7c28505323deb6dec29816f8d85cb4fb389b8a6289c076e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "fd7668cb9f7a9030f3bee71b7fe1425306ba088d7cd6731fc966566ec2e40c84"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9b3edbb486da1afcc4b5f65f8d5fa6145a2d3cc583dee749bdf069ebe1201ce0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5dc9be506b8a58849dfa5e68b69e3c3ef0f6082788ccb1993c21f21c59925567"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1d26d9bcca9fe9fe2dd7dce24a59f7cffa9b5bd49cce4449bdbdb80ff6ef3a87"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5415438e55ac33c46c1efbc3d5ab90e1b1d3e9054ff2e34e8e5a80e5e80dcc2c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f8586d8169d57e377a0eb468e5c1c979043a796c69e327ab3b21c828c260273f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "2fb3c6d9a2fac93e8e3754dfde3d383696d74d4dd3763d5e880ee6f6c60c9bdb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ccc4804faf569dcd76fcf2d10e9f5fedd0be5674a8e3ca51710f1b9e30ccd94b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "582311db92ccebe1a8bb7d0d767e1a81704c75c19ce6de3f0bc6e0d523f1ac24"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, 
false, "8a6d9c921b0fd1c67c8a5b3108619b2217be6381dc5bfbe6ccc96742ebf35070"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f20aa5b22ef9edb1e4f33f1a0de3327a2be44bd16b71803e16e8e952179428d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f54778b60953659dfc85cfdbc5ad61b9ab3850b7551c2a66fd19b8e36fa124d6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a68db57c3c923d6a8e7d78a6ba0b65ba102fde748fa5d3f161555a69725434e1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "989290761670f433260eb990656ec7c05759798e64b30e37b602c178f24cdae2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e1d1a59b8ba6945f21516685d1cd18ee6b3d481d30819cacaf3adc883b5b96a2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "94ede31b184c06e767931f458845efaf7a13a7c63cfd19961b2895770c4178c3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6021a8bd2d53e0daf5a5fe7aa506fe729670b3f70d27754f7505b40f97913932"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c1b126a1e183a6a6c26c52f0066822dce4d32435833679cfce646c00e22f27d9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5d78fb41e66dc62efd3728cc096ddff333c50f8e5443ab240dbc9818f1534823"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "025b4d21bc251028898fa8fbd7c5a7a96152997740996febd237d32f647dfa6c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 
0, false, false, false, false, false, "79f63ffc5ba9d39dc043168d2be388c01a0e75adae280bd3d69c35e13a776228"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "58e01788926e903afc96dcc98a58c1d1ec75100d9afeb31245a15f2b963e0679"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "cd404980bc7f177c2efc9fbf23ac51f400b1b8bcab8a02e39c6e97a049f7c471"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "14653659bed25ea83daedae56574c85346a1665b48c41a08feb6e31b33003c25"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4c29caf1c6aa8bcd3cfac8e809a5ce386b79cb7ff462ed7f0b77afc9ac4fa73a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e884426ad2305853949f59f3f73da8fd105173494cee8072bc97cbb48601b449"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ba6ec6216e94296e229870faf3c9bcc4384966d5e374032ab40e9aeb07882e44"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ef8f670dcf41e721db7285008adbdf64027944e73a19ad85126c5a748f19ec35"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "54486aec2ed5115c3722f6aa35a49348742b46d25ed902a8f39d385bc5196afe"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a62503410ddcbdd27abedd6e727ad3aecc4a44858751d4055580cb1b8358e4dc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "75bf5f2e04865a16ace15ee1774b9886454a1186a14751ad530eeeba21ca11f7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "091ad4191a16da13d5cc79e9eee8f034ceb3b6e5912639a406ee725cd2d26933"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "36943a02b0cebfa8154ca30a68383027bc209f412d68b15ac4c32a95120d0d53"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6fbca064073994fc9bef476301d4aa32a825b4de0d5bf30821fc1647bfee652a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c52aa32e56a5e7288ce74f82c3fb795cb29e448a7eaa467e6155aad5e49bb622"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "aa3116224595754aef8a6c4d047d4a7ba147f3b71c43ab99e1590b628d413f84"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "25668c95af705c472d69dccee0107de1777cfc2a3d0a3baf730b26890b5232ee"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "53c5b249765658fc3b14bdf8ba870b13a6e9fd467b2ee637c162fa5f1106184e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "175b34125231a29f00ee05fdc17b05c98503c62b5d0c249b76869c94ecc92a66"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0252b9c058f55e1940ec8314c0c88ad09c947c531559402503167337bae2e2c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "491ecba7d8c114710c949cdd126a020ff2a8a038a220858044dfa0811394dc63"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "397a76015802b64c7be769c2da082e380f27fd2671ffce505512ea3486b62807"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"b9fe356ba40304077f1d5e6a0ddf7c3a367b7f4d58862875b60d0bc97984fd08"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a6400d4402ffc436a632b694159cd80fd5ccc33adfde50c46acde59957adc629"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "70a36c3032f1252be2bbcc4f08c2a0bea28a5c164f8cb5babe7f850cf7a5a28f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3a2cad1c59ef64c28f3c94035f6d7deabe99e7ea6742fbfcf0b6aed53755b1e0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "49ea496f19c20b582b6ce3fdf1d1ad3a95186c8e067029bba26d8aff9803be82"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3dd4a5852be21c8a21b060e1e18ead56a31d9f65c100c81da9d92c6977048185"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b8ed517d527723595b3cd5aaa405ef4d79092df23e41ae7d8302cf323d968d9d"}, +#endif // EXCLUDE_SM_100 +}; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp deleted file mode 100644 index d6e0a71040..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58892823173fd43ae549acccc4821c4eddc1605cce202489b0d1f425ebe279e3 -size 1573155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 4eb5ac5266..7d4219baf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -703,8 +703,8 @@ inline TllmGenFmhaKernel const* getTllmFmhaKernels( { #if !defined(EXCLUDE_SM_100) || !defined(EXCLUDE_SM_103) - return TllmFmhaKernelFactory::Get().getKernels( - sTllmGenFmhaKernelMetaInfos, sTllmGenFmhaKernelMetaInfosSize, dtypeQ, dtypeKv, dtypeOut, sm); + return TllmFmhaKernelFactory::Get().getKernels(sTllmGenFmhaKernelMetaInfos, + sizeof(sTllmGenFmhaKernelMetaInfos) / sizeof(sTllmGenFmhaKernelMetaInfos[0]), dtypeQ, dtypeKv, dtypeOut, sm); #else return nullptr; #endif // EXCLUDE_SM_100 diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 458b6983d8..34542a1401 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -460,10 +460,13 @@ void XqaDispatcher::runImpl( tllmRunnerParams.cumSeqLensQPtr = cu_seqlens; tllmRunnerParams.cumSeqLensKvPtr = reinterpret_cast(launchParams.cu_kv_seq_lens); // Attention scales device pointers (only fp8 kernels need to load scales from the device memory). - tllmRunnerParams.outputScalePtr = reinterpret_cast(launchParams.bmm2_scale_ptr); - tllmRunnerParams.scaleSoftmaxLog2Ptr = launchParams.bmm1_scale_ptr - ? reinterpret_cast(launchParams.bmm1_scale_ptr + kIdxScaleSoftmaxLog2Ptr) - : nullptr; + if (mQDataType == DATA_TYPE_E4M3) + { + tllmRunnerParams.outputScalePtr = reinterpret_cast(launchParams.bmm2_scale_ptr); + tllmRunnerParams.scaleSoftmaxLog2Ptr = launchParams.bmm1_scale_ptr + ? 
reinterpret_cast(launchParams.bmm1_scale_ptr + kIdxScaleSoftmaxLog2Ptr) + : nullptr; + } tllmRunnerParams.oSfScalePtr = params.fp4_out_sf_scale; tllmRunnerParams.oPtr = params.output; From 12727ebd7f1d7a24894a2b7f1a2da1c173ac0081 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Tue, 16 Dec 2025 18:54:32 +0800 Subject: [PATCH 169/172] [None][infra] Waive failed test for main branch on 12/16 (#10029) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 36711424f6..0d5cee3216 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -466,3 +466,9 @@ examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/57 examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293) examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293) disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5741884) +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype SKIP (https://nvbugs/5744427) +test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740087) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) 
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) From 6a238ca8ade4ac645e22d4fa72d5d7f1290c567b Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Tue, 16 Dec 2025 18:58:43 +0800 Subject: [PATCH 170/172] [None][doc] Update CONTRIBUTING.md (#10023) Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- CONTRIBUTING.md | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d5b42a6a3..e215f3d021 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,33 +8,9 @@ ## Coding Guidelines -* Coding style for TensorRT-LLM can be found [in this document](CODING_GUIDELINES.md). +TensorRT-LLM Coding Style can be found [in this document](CODING_GUIDELINES.md). -* All contributed C++ code should be formatted following the rules in TensorRT-LLM's [clang-format](.clang-format) file. The recommended version is clang-format>=14.0. - -* Changes can be formatted with the following command: - - ```bash - # Commit ID is optional - if unspecified, run format on staged changes. - git-clang-format --style file [commit ID/reference] - ``` - -* All contributed Python code should be formatted using the `black` Python package. The recommended version is `black>=23.0` - -* Changes can be formatted with the following command: - - ```bash - git diff --name-only | grep "*.py" | xargs black -l 120 - ``` - -* Try to keep pull requests (PRs) as concise as possible: - * Avoid committing commented-out code. 
- * Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes. - -## Coding Style - -We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local -Python environment. +We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local Python environment. ```bash pip install pre-commit @@ -73,6 +49,9 @@ mdformat.................................................................Passed If any files were modified by this hook, you will need to stage and commit them again. +In addition, please try to keep pull requests (PRs) as concise as possible: +* Avoid committing commented-out code. +* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes. 
## Pull Requests From 609d1d03834c7248d3914380eb8ce5186601ecaa Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Tue, 16 Dec 2025 20:06:49 +0800 Subject: [PATCH 171/172] [None][fix] Fix Illegal Memory Access for CuteDSL Grouped GEMM (#10008) Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- ...guous_gather_grouped_gemm_swiglu_fusion.py | 56 +++++++-------- .../blockscaled_contiguous_grouped_gemm.py | 50 ++++++------- ...contiguous_grouped_gemm_finalize_fusion.py | 54 +++++++------- ...d_contiguous_grouped_gemm_swiglu_fusion.py | 50 ++++++------- .../dense_blockscaled_gemm_persistent.py | 58 +++++++-------- .../defs/accuracy/test_llm_api_pytorch.py | 71 ++++++++----------- 6 files changed, 165 insertions(+), 174 deletions(-) diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py index 2cd24ebe9c..3540f91550 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py @@ -273,7 +273,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], vectorized_f32: bool, - topk: int, + topk: cutlass.Int64, ): """Initializes the configuration for a Blackwell blockscaled dense GEMM kernel with gather operation and SwiGLU fusion. @@ -310,7 +310,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: :param vectorized_f32: Enable vectorized f32x2 operations for better performance. :type vectorized_f32: bool :param topk: Number of experts selected per token (used for token ID mapping). 
- :type topk: int + :type topk: cutlass.Int64 """ self.sf_vec_size = sf_vec_size @@ -1083,7 +1083,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: gC_mnl = cute.local_tile( mC_mnl, cute.slice_(self.mma_tiler_c, (None, None, 0)), (None, None, None) ) - k_tile_cnt = cute.size(gA_mkl, mode=[3]) + k_tile_cnt = cutlass.Int32(cute.size(gA_mkl, mode=[3])) # # Partition global tensor for TiledMMA_A/B/C @@ -2738,7 +2738,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the mma tiler and cluster shape are valid @@ -2750,7 +2750,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the mma tiler and cluster shape are valid, False otherwise :rtype: bool @@ -2803,10 +2803,10 @@ class BlockScaledContiguousGatherGroupedGemmKernel: @staticmethod def is_valid_tensor_alignment( - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 ab_dtype: Type[cutlass.Numeric], c_dtype: Type[cutlass.Numeric], a_major: str, @@ -2817,13 +2817,13 @@ class BlockScaledContiguousGatherGroupedGemmKernel: Check if the tensor alignment is valid :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param ab_dtype: The data type of the A and B operands :type ab_dtype: Type[cutlass.Numeric] 
:param c_dtype: The data type of the output tensor @@ -2863,14 +2863,14 @@ class BlockScaledContiguousGatherGroupedGemmKernel: c_dtype: Type[cutlass.Numeric], mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 a_major: str, b_major: str, c_major: str, - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the gemm can be implemented @@ -2892,13 +2892,13 @@ class BlockScaledContiguousGatherGroupedGemmKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param a_major: The major axis of the A tensor :type a_major: str :param b_major: The major axis of the B tensor @@ -2906,7 +2906,7 @@ class BlockScaledContiguousGatherGroupedGemmKernel: :param c_major: The major axis of the C tensor :type c_major: str :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the gemm can be implemented, False otherwise :rtype: bool @@ -2955,11 +2955,11 @@ class BlockScaledContiguousGatherGroupedGemmKernel: token_id_mapping_ptr: cute.Pointer, num_non_exiting_tiles_ptr: cute.Pointer, global_sf_ptr: cute.Pointer, - orig_m: int, - m: int, - n: int, - k: int, - l: int, # noqa: E741 + orig_m: cutlass.Int64, + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 tile_size: cutlass.Constexpr, scaling_vector_size: cutlass.Constexpr, max_active_clusters: cutlass.Constexpr, diff 
--git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py index 1b1f21b3f9..b6ea02cf36 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py @@ -837,7 +837,7 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: gC_mnl = cute.local_tile( mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) ) - k_tile_cnt = cute.size(gA_mkl, mode=[3]) + k_tile_cnt = cutlass.Int32(cute.size(gA_mkl, mode=[3])) # # Partition global tensor for TiledMMA_A/B/C @@ -2021,7 +2021,7 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the mma tiler and cluster shape are valid @@ -2033,7 +2033,7 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the mma tiler and cluster shape are valid, False otherwise :rtype: bool @@ -2086,10 +2086,10 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: @staticmethod def is_valid_tensor_alignment( - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 ab_dtype: Type[cutlass.Numeric], c_dtype: Type[cutlass.Numeric], a_major: str, @@ -2100,13 +2100,13 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: Check if the tensor alignment is valid :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B 
tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param ab_dtype: The data type of the A and B operands :type ab_dtype: Type[cutlass.Numeric] :param c_dtype: The data type of the output tensor @@ -2148,14 +2148,14 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 a_major: str, b_major: str, c_major: str, - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the gemm can be implemented @@ -2177,13 +2177,13 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param a_major: The major axis of the A tensor :type a_major: str :param b_major: The major axis of the B tensor @@ -2191,7 +2191,7 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: :param c_major: The major axis of the C tensor :type c_major: str :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the gemm can be implemented, False otherwise :rtype: bool @@ -2233,10 +2233,10 @@ class Sm100BlockScaledContiguousGroupedGemmKernel: alpha_ptr: cute.Pointer, tile_idx_to_group_idx_ptr: cute.Pointer, num_non_exiting_tiles_ptr: 
cute.Pointer, - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 tile_size: cutlass.Constexpr, scaling_vector_size: cutlass.Constexpr, max_active_clusters: cutlass.Constexpr, diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py index 576c683b87..f556523b9f 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py @@ -998,7 +998,7 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: (None, None, None), ) - k_tile_cnt = cute.size(gA_mkl, mode=[3]) + k_tile_cnt = cutlass.Int32(cute.size(gA_mkl, mode=[3])) # # Partition global tensor for TiledMMA_A/B @@ -2030,7 +2030,7 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the mma tiler and cluster shape are valid @@ -2042,7 +2042,7 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the mma tiler and cluster shape are valid, False otherwise :rtype: bool @@ -2095,10 +2095,10 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: @staticmethod def is_valid_tensor_alignment( - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 ab_dtype: 
Type[cutlass.Numeric], out_dtype: Type[cutlass.Numeric], a_major: str, @@ -2109,13 +2109,13 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: Check if the tensor alignment is valid :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param ab_dtype: The data type of the A and B operands :type ab_dtype: Type[cutlass.Numeric] :param out_dtype: The data type of the output tensor @@ -2157,14 +2157,14 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 a_major: str, b_major: str, c_major: str, - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the gemm can be implemented @@ -2186,13 +2186,13 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param a_major: The major axis of the A tensor :type a_major: str :param b_major: The major axis of the B tensor @@ -2200,7 +2200,7 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: :param c_major: The major axis of the C tensor :type c_major: str :param m_aligned: The 
alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the gemm can be implemented, False otherwise :rtype: bool @@ -2245,12 +2245,12 @@ class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: permuted_idx_to_expanded_idx_ptr: cute.Pointer, num_non_exiting_tiles_ptr: cute.Pointer, token_final_scales_ptr: cute.Pointer, - m: int, - n: int, - k: int, - l: int, # noqa: E741 - num_tokens: int, - top_k: int, + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 + num_tokens: cutlass.Int64, + top_k: cutlass.Int64, tile_size: cutlass.Constexpr, scaling_vector_size: cutlass.Constexpr, max_active_clusters: cutlass.Constexpr, diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_swiglu_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_swiglu_fusion.py index f16c62a417..12a37c31b8 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_swiglu_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_swiglu_fusion.py @@ -991,7 +991,7 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: gC_mnl = cute.local_tile( mC_mnl, cute.slice_(self.mma_tiler_c, (None, None, 0)), (None, None, None) ) - k_tile_cnt = cute.size(gA_mkl, mode=[3]) + k_tile_cnt = cutlass.Int32(cute.size(gA_mkl, mode=[3])) # # Partition global tensor for TiledMMA_A/B/C @@ -2405,7 +2405,7 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the mma tiler and cluster shape are valid @@ -2417,7 +2417,7 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type 
cluster_shape_mn: Tuple[int, int] :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the mma tiler and cluster shape are valid, False otherwise :rtype: bool @@ -2470,10 +2470,10 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: @staticmethod def is_valid_tensor_alignment( - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 ab_dtype: Type[cutlass.Numeric], c_dtype: Type[cutlass.Numeric], a_major: str, @@ -2484,13 +2484,13 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: Check if the tensor alignment is valid :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param ab_dtype: The data type of the A and B operands :type ab_dtype: Type[cutlass.Numeric] :param c_dtype: The data type of the output tensor @@ -2532,14 +2532,14 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: use_2cta_instrs: bool, mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 a_major: str, b_major: str, c_major: str, - m_aligned: int, + m_aligned: cutlass.Int64, ) -> bool: """ Check if the gemm can be implemented @@ -2561,13 +2561,13 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster :type cluster_shape_mn: Tuple[int, int] :param m: The number of rows in the A tensor - :type m: int + :type m: cutlass.Int64 :param n: The 
number of columns in the B tensor - :type n: int + :type n: cutlass.Int64 :param k: The number of columns in the A tensor - :type k: int + :type k: cutlass.Int64 :param l: The number of columns in the C tensor - :type l: int + :type l: cutlass.Int64 :param a_major: The major axis of the A tensor :type a_major: str :param b_major: The major axis of the B tensor @@ -2575,7 +2575,7 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: :param c_major: The major axis of the C tensor :type c_major: str :param m_aligned: The alignment requirement for group M dimension (default: 128) - :type m_aligned: int + :type m_aligned: cutlass.Int64 :return: True if the gemm can be implemented, False otherwise :rtype: bool @@ -2619,10 +2619,10 @@ class Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel: tile_idx_to_group_idx_ptr: cute.Pointer, num_non_exiting_tiles_ptr: cute.Pointer, global_sf_ptr: cute.Pointer, - m: int, - n: int, - k: int, - l: int, # noqa: E741 + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, # noqa: E741 tile_size: cutlass.Constexpr, scaling_vector_size: cutlass.Constexpr, max_active_clusters: cutlass.Constexpr, diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py index 44edab9b3f..913473cf20 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py @@ -757,7 +757,7 @@ class Sm100BlockScaledPersistentDenseGemmKernel: gC_mnl = cute.local_tile(mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None)) - k_block_cnt = cute.size(gA_mkl, mode=[3]) + k_block_cnt = cutlass.Int32(cute.size(gA_mkl, mode=[3])) # # Partition global tensor for TiledMMA_A/B/C @@ -1910,10 +1910,10 @@ class Sm100BlockScaledPersistentDenseGemmKernel: @staticmethod def 
is_valid_tensor_alignment( - m: int, - n: int, - k: int, - l: int, + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, ab_dtype: Type[cutlass.Numeric], c_dtype: Type[cutlass.Numeric], a_major: str, @@ -1923,10 +1923,10 @@ class Sm100BlockScaledPersistentDenseGemmKernel: """Checks if the tensor dimensions are valid for memory alignment. Args: - m (int): The M dimension of the GEMM problem. - n (int): The N dimension of the GEMM problem. - k (int): The K dimension of the GEMM problem. - l (int): The batch dimension (L) of the GEMM problem. + m (cutlass.Int64): The M dimension of the GEMM problem. + n (cutlass.Int64): The N dimension of the GEMM problem. + k (cutlass.Int64): The K dimension of the GEMM problem. + l (cutlass.Int64): The batch dimension (L) of the GEMM problem. ab_dtype (Type[cutlass.Numeric]): Data type of operands A and B. c_dtype (Type[cutlass.Numeric]): Data type of the output tensor C. a_major (str): The major layout of tensor A ('k' or 'm'). @@ -1962,10 +1962,10 @@ class Sm100BlockScaledPersistentDenseGemmKernel: c_dtype: Type[cutlass.Numeric], mma_tiler_mn: Tuple[int, int], cluster_shape_mn: Tuple[int, int], - m: int, - n: int, - k: int, - l: int, + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + l: cutlass.Int64, a_major: str, b_major: str, c_major: str, @@ -1983,10 +1983,10 @@ class Sm100BlockScaledPersistentDenseGemmKernel: mma_tiler_mn (Tuple[int, int]): The (M, N) shape of the MMA tiler. cluster_shape_mn (Tuple[int, int]): The (M, N) shape of the CTA cluster. - m (int): The M dimension of the GEMM problem. - n (int): The N dimension of the GEMM problem. - k (int): The K dimension of the GEMM problem. - l (int): The batch dimension (L) of the GEMM problem. + m (cutlass.Int64): The M dimension of the GEMM problem. + n (cutlass.Int64): The N dimension of the GEMM problem. + k (cutlass.Int64): The K dimension of the GEMM problem. + l (cutlass.Int64): The batch dimension (L) of the GEMM problem. 
a_major (str): The major layout of tensor A ('k' or 'm'). b_major (str): The major layout of tensor B ('k' or 'n'). c_major (str): The major layout of tensor C ('n' or 'm'). @@ -2017,12 +2017,12 @@ class Sm100BlockScaledPersistentDenseGemmKernel: @cute.jit def wrapper( self, - m: cutlass.Int32, - n: cutlass.Int32, - k: cutlass.Int32, - sf_m: cutlass.Int32, - sf_n: cutlass.Int32, - sf_k: cutlass.Int32, + m: cutlass.Int64, + n: cutlass.Int64, + k: cutlass.Int64, + sf_m: cutlass.Int64, + sf_n: cutlass.Int64, + sf_k: cutlass.Int64, l: cutlass.Constexpr, a_ptr: cute.Pointer, b_ptr: cute.Pointer, @@ -2038,12 +2038,12 @@ class Sm100BlockScaledPersistentDenseGemmKernel: """Executes the wrapped GEMM kernel with dynamically shaped tensors. Args: - m (int): The M dimension of the GEMM problem. - n (int): The N dimension of the GEMM problem. - k (int): The K dimension of the GEMM problem. - sf_m (int): The M dimension of the scale factor tensor. - sf_n (int): The N dimension of the scale factor tensor. - sf_k (int): The K dimension of the scale factor tensor. + m (cutlass.Int64): The M dimension of the GEMM problem. + n (cutlass.Int64): The N dimension of the GEMM problem. + k (cutlass.Int64): The K dimension of the GEMM problem. + sf_m (cutlass.Int64): The M dimension of the scale factor tensor. + sf_n (cutlass.Int64): The N dimension of the scale factor tensor. + sf_k (cutlass.Int64): The K dimension of the scale factor tensor. l (cutlass.Constexpr): The batch dimension (L) of the GEMM problem. a_ptr (cute.Pointer): Pointer to the A tensor. b_ptr (cute.Pointer): Pointer to the B tensor. 
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 483dff0598..40c4dad222 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1788,12 +1788,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"]) def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, torch_compile, mtp_nextn, moe_backend): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") - if moe_backend == "CUTEDSL" and get_sm_version() != 100: - pytest.skip(f"{moe_backend} backend supports SM 100 only") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") + if moe_backend == "CUTEDSL" and sm_version not in (100, 103): + pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = TorchCompileConfig( @@ -1903,12 +1902,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") - if moe_backend == "CUTEDSL" and get_sm_version() != 100: - pytest.skip(f"{moe_backend} backend supports SM 100 only") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") + if moe_backend == "CUTEDSL" and sm_version not in (100, 103): + pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only") kv_cache_config = 
KvCacheConfig(free_gpu_memory_fraction=0.75) # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp. @@ -2261,10 +2259,9 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): attention_dp, enable_lm_head_tp_in_adp, cuda_graph, overlap_scheduler, max_batch_size, moe_backend): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) pytorch_config = dict( @@ -2395,10 +2392,9 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size, moe_backend): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) pytorch_config = dict( @@ -2744,10 +2740,9 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness): def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size, moe_backend, skip_indexer): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, @@ -2810,10 +2805,9 @@ class 
TestDeepSeekV32(LlmapiAccuracyTestHarness): mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size, moe_backend): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, @@ -3538,10 +3532,9 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): torch_compile, ): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") torch_compile_config = TorchCompileConfig( enable_fullgraph=True, @@ -3765,10 +3758,9 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, moe_backend, eagle3): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -4913,10 +4905,9 @@ class TestMistralLarge3_675B(LlmapiAccuracyTestHarness): def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, moe_backend, eagle3): - if moe_backend == "TRTLLM" and (get_sm_version() == 120 - or get_sm_version() == 121): - pytest.skip( - "MOE TRTLLM 
backend does not support SM version 120 or 121") + sm_version = get_sm_version() + if moe_backend == "TRTLLM" and sm_version in (120, 121): + pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, From bd13957e7077eda71a5f17752f601368f83e6fca Mon Sep 17 00:00:00 2001 From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> Date: Tue, 16 Dec 2025 21:16:32 +0800 Subject: [PATCH 172/172] [TRTLLM-9181][feat] improve disagg-server prometheus metrics; synchronize workers' clocks when workers are dynamic (#9726) Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> --- tensorrt_llm/_utils.py | 18 +- tensorrt_llm/serve/disagg_auto_scaling.py | 24 +- tensorrt_llm/serve/openai_client.py | 27 ++- tensorrt_llm/serve/openai_disagg_server.py | 44 ++-- tensorrt_llm/serve/perf_metrics.py | 60 +++-- tensorrt_llm/serve/router.py | 50 ++-- .../defs/disaggregated/test_auto_scaling.py | 2 +- .../defs/disaggregated/test_disaggregated.py | 119 +--------- .../examples/serve/test_serve_negative.py | 9 +- tests/integration/defs/perf/utils.py | 41 +--- tests/integration/defs/pytest.ini | 2 +- tests/integration/defs/test_e2e.py | 15 ++ .../test_lists/qa/llm_function_multinode.txt | 2 + .../test_lists/test-db/l0_dgx_h100.yml | 1 + .../perf-sanity/run_benchmark_serve.py | 20 +- tests/test_common/__init__.py | 0 tests/test_common/http_utils.py | 29 +++ tests/test_common/perf_metrics_utils.py | 188 +++++++++++++++ .../apps/_test_disagg_serving_multi_nodes.py | 48 ++-- ...g_serving_multi_nodes_service_discovery.py | 220 ++++++++++++++++++ .../llmapi/apps/_test_openai_mmencoder.py | 34 +-- tests/unittest/llmapi/apps/openai_server.py | 110 +++++++-- .../apps/test_disagg_serving_perf_metrics.py | 219 +++++++++++++++++ tests/unittest/llmapi/apps/utils.py | 28 +++ tests/unittest/pytest.ini | 1 + 25 files changed, 987 insertions(+), 324 deletions(-) create mode 100644 
tests/test_common/__init__.py create mode 100644 tests/test_common/http_utils.py create mode 100644 tests/test_common/perf_metrics_utils.py create mode 100644 tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes_service_discovery.py create mode 100644 tests/unittest/llmapi/apps/test_disagg_serving_perf_metrics.py diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index bb264c939c..d89f218345 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -473,10 +473,20 @@ def dim_resolve_negative(dim, ndim): return tuple(pos) -def get_free_port(): - with socket.socket() as sock: - sock.bind(("", 0)) - return sock.getsockname()[1] +def get_free_port() -> int: + return get_free_ports(1)[0] + + +def get_free_ports(num=1) -> List[int]: + sockets = [ + socket.socket(socket.AF_INET, socket.SOCK_STREAM) for _ in range(num) + ] + for s in sockets: + s.bind(('', 0)) + ports = [s.getsockname()[1] for s in sockets] + for s in sockets: + s.close() + return ports # mpi4py only exports MPI_COMM_TYPE_SHARED, so we define OMPI_COMM_TYPE_HOST here diff --git a/tensorrt_llm/serve/disagg_auto_scaling.py b/tensorrt_llm/serve/disagg_auto_scaling.py index 62a7b5bc40..292778ab5d 100644 --- a/tensorrt_llm/serve/disagg_auto_scaling.py +++ b/tensorrt_llm/serve/disagg_auto_scaling.py @@ -2,6 +2,7 @@ import asyncio import json import os import random +import socket import time from dataclasses import asdict, dataclass from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple @@ -29,6 +30,18 @@ def get_worker_key(name: str, role: ServerRole, worker_id: str = "") -> str: return f"{get_worker_key_prefix(name)}/{worker_id}" +def get_host_from_uri(uri: str) -> str: + return uri.split("://")[1].split(":")[0] + + +# Get the local ip address from a remote host, +# if remote host is not provided, use Google's public DNS server "8.8.8.8" +def get_local_ip(remote_host: str = "8.8.8.8") -> str: + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + 
s.connect((remote_host, 80)) + return s.getsockname()[0] + + class DisaggClusterManager: """ The cluster manager is responsible for managing the workers in the cluster. @@ -238,18 +251,25 @@ class DisaggClusterWorker: It will send heartbeat to the cluster storage every heartbeat_interval_sec seconds. If the worker heartbeat fails, it will re-register itself. """ + LOCALHOST_IPS = ["localhost", "127.0.0.1", "0.0.0.0", "::1", + "::"] # nosec B104 def __init__(self, role: ServerRole, host: str, port: int, config: DisaggClusterConfig, storage: ClusterStorage): self._role = role - self._host = host self._port = port self._config = config self._cluster_storage = storage self._stop = False self._heartbeat_task = None self._last_heartbeat = 0 - self._worker_id = f"{role.name}-{host}:{port}-{int(time.time()*1000)}-{os.getpid()}-{random.randint(0, 1000):03}" + register_host = host + # if the host is localhost and the cluster uri is not localhost, use the hostname to register the worker + disagg_host = get_host_from_uri(self._config.cluster_uri) + if host in self.LOCALHOST_IPS and disagg_host not in self.LOCALHOST_IPS: + register_host = get_local_ip(disagg_host) + self._host = register_host + self._worker_id = f"{role.name}-{register_host}:{port}-{int(time.time()*1000)}-{os.getpid()}-{random.randint(0, 1000):03}" def __del__(self): try: diff --git a/tensorrt_llm/serve/openai_client.py b/tensorrt_llm/serve/openai_client.py index 48172ca666..951fba5a7d 100644 --- a/tensorrt_llm/serve/openai_client.py +++ b/tensorrt_llm/serve/openai_client.py @@ -183,6 +183,9 @@ class OpenAIHttpClient(OpenAIClient): yield response_dict # finish the request after the successful response await self._finish_request(request) + self._metrics_collector.complete_latency_seconds.observe( + get_steady_clock_now_in_seconds() - start_time + ) break # break and skip retries if the whole response is processed without exception except (aiohttp.ClientError, OSError) as e: if lines_yielded > 0: @@ -227,25 
+230,24 @@ class OpenAIHttpClient(OpenAIClient): i = 0 async for line in http_response.content.iter_any(): now_time = get_steady_clock_now_in_seconds() - if i == 0: - if hooks: - hooks.on_first_token(server, request) - self._metrics_collector.first_token_latency_seconds.observe( - now_time - last_token_time - ) - else: - self._metrics_collector.per_token_latency_seconds.observe( - now_time - last_token_time - ) - i += 1 if line: + if i == 0: + if hooks: + hooks.on_first_token(server, request) + self._metrics_collector.first_token_latency_seconds.observe( + now_time - last_token_time + ) + else: + self._metrics_collector.per_token_latency_seconds.observe( + now_time - last_token_time + ) + i += 1 yield line await asyncio.sleep(0) last_token_time = now_time if hooks: hooks.on_resp_done(server, request, None) - self._metrics_collector.completed_requests.inc() self._metrics_collector.complete_latency_seconds.observe( get_steady_clock_now_in_seconds() - start_time ) @@ -262,6 +264,7 @@ class OpenAIHttpClient(OpenAIClient): await self._finish_request(request) async def _finish_request(self, request: UCompletionRequest) -> None: + self._metrics_collector.completed_requests.inc() await self._router.finish_request(request) async def collect_metrics(self) -> Dict[str, Any]: diff --git a/tensorrt_llm/serve/openai_disagg_server.py b/tensorrt_llm/serve/openai_disagg_server.py index 524dd9fd11..7639e405a5 100644 --- a/tensorrt_llm/serve/openai_disagg_server.py +++ b/tensorrt_llm/serve/openai_disagg_server.py @@ -57,11 +57,12 @@ class RawRequestResponseHooks(ResponseHooks): self.raw_req = raw_req self.ctx_server = "" self.gen_server = "" + self.request_arrival_time = raw_req.state.server_arrival_time self.server_first_token_time = 0 self.perf_metrics_collector = perf_metrics_collector def on_req_begin(self, request: UCompletionRequest): - ... 
+ self.perf_metrics_collector.queue_latency_seconds.observe(get_steady_clock_now_in_seconds() - self.request_arrival_time) def on_ctx_resp(self, ctx_server: str, response: UCompletionResponse): self.ctx_server = ctx_server @@ -93,8 +94,8 @@ class OpenAIDisaggServer: self._metrics_interval_secs = metrics_interval_secs self._ctx_servers, self._gen_servers = get_ctx_gen_server_addrs(config.server_configs) - self._ctx_router = create_router(config.ctx_router_config, self._ctx_servers, metadata_server_cfg, create_metadata_server(metadata_server_cfg)) - self._gen_router = create_router(config.gen_router_config, self._gen_servers, metadata_server_cfg, create_metadata_server(metadata_server_cfg)) + self._ctx_router = create_router(config.ctx_router_config, self._ctx_servers, metadata_server_cfg, create_metadata_server(metadata_server_cfg), self._sync_server_clock) + self._gen_router = create_router(config.gen_router_config, self._gen_servers, metadata_server_cfg, create_metadata_server(metadata_server_cfg), self._sync_server_clock) self._metadata_server = create_metadata_server(metadata_server_cfg) self._perf_metrics_collector = DisaggPerfMetricsCollector(config.perf_metrics_max_requests) @@ -122,8 +123,10 @@ class OpenAIDisaggServer: @asynccontextmanager async def lifespan(app) -> None: + # Prepare servers (sync server clock) when static ctx/gen server list is used + await self._ctx_router.prepare_servers() + await self._gen_router.prepare_servers() await self._service.setup() - await self._set_steady_clock_offsets() yield await self._service.teardown() @@ -133,6 +136,7 @@ class OpenAIDisaggServer: @self.app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): + self._perf_metrics_collector.validation_exceptions.inc() return JSONResponse(status_code=400, content={"error": str(exc)}) self.register_routes() @@ -158,8 +162,14 @@ class OpenAIDisaggServer: def _wrap_entry_point(self, entry_point: Callable) -> Callable: async def 
wrapper(req: UCompletionRequest, raw_req: Request) -> Response: try: + self._perf_metrics_collector.total_requests.inc() + if req.stream: + self._perf_metrics_collector.stream_requests.inc() + else: + self._perf_metrics_collector.nonstream_requests.inc() hooks = RawRequestResponseHooks(raw_req, self._perf_metrics_collector) response_or_generator = await entry_point(req, hooks) + self._perf_metrics_collector.total_responses.inc() if req.stream: return StreamingResponse(content=response_or_generator, media_type="text/event-stream") else: @@ -173,9 +183,11 @@ class OpenAIDisaggServer: logger.error("CppExecutorError: ", traceback.format_exc()) signal.raise_signal(signal.SIGINT) elif isinstance(exception, HTTPException): + self._perf_metrics_collector.http_exceptions.inc() logger.error(f"HTTPException {exception.status_code} {exception.detail}: ", traceback.format_exc()) raise exception else: + self._perf_metrics_collector.internal_errors.inc() logger.error("Internal server error: ", traceback.format_exc()) raise HTTPException(status_code=500, detail=f"Internal server error {str(exception)}") @@ -199,13 +211,12 @@ class OpenAIDisaggServer: timeout_keep_alive=TIMEOUT_KEEP_ALIVE) await uvicorn.Server(config).serve(sockets=sockets) - # TODO: rework this for service discovery, now it's only for static server list - async def _set_steady_clock_offsets(self): - STEADY_CLOCK_OFFSET_ENDPOINT = "/steady_clock_offset" + async def _sync_server_clock(self, server: str): + """ Sync the ctx/gen server's steady clock with the disagg-server's steady clock (in case NTP service is not running). 
""" async def query_steady_clock_offset(session: aiohttp.ClientSession, server_url: str) -> tuple[Optional[float], Optional[float]]: try: originate_ts = get_steady_clock_now_in_seconds() - async with session.get(server_url + STEADY_CLOCK_OFFSET_ENDPOINT) as response: + async with session.get(server_url) as response: destination_ts = get_steady_clock_now_in_seconds() if response.status == 200: response_content = await response.json() @@ -222,12 +233,11 @@ class OpenAIDisaggServer: async def set_steady_clock_offset(session: aiohttp.ClientSession, server_url: str, offset: float) -> None: payload = {"offset": offset} - async with session.post(server_url + STEADY_CLOCK_OFFSET_ENDPOINT, json=payload) as response: + async with session.post(server_url, json=payload) as response: if response.status != 200: logger.warning(f"Cannot set disagg server steady clock offset for server {server_url}, the perf metrics timestamps could be mis-aligned") async def align_steady_clock_offset(session: aiohttp.ClientSession, server_url: str) -> None: - server_url = f"http://{server_url}" if not server_url.startswith("http://") else server_url delay, offset = await query_steady_clock_offset(session, server_url) if delay is None or offset is None: logger.warning(f"Unable to measure steady clock offset for {server_url}; skipping adjustment") @@ -236,7 +246,13 @@ class OpenAIDisaggServer: # Negate the offset so that worker servers can adjust their steady clock by adding the new offset await set_steady_clock_offset(session, server_url, -offset) - async with aiohttp.ClientSession( - connector=aiohttp.TCPConnector(limit=0, limit_per_host=0, force_close=True), - timeout=aiohttp.ClientTimeout(total=self._req_timeout_secs)) as session: - await asyncio.gather(*[align_steady_clock_offset(session, server_url) for server_url in self._ctx_servers + self._gen_servers]) + server_scheme = "http://" if not server.startswith("http://") else "" + server_url = f"{server_scheme}{server}/steady_clock_offset" + + 
try: + async with aiohttp.ClientSession( + connector=aiohttp.TCPConnector(limit=0, limit_per_host=0, force_close=True), + timeout=aiohttp.ClientTimeout(total=self._req_timeout_secs)) as session: + await align_steady_clock_offset(session, server_url) + except (aiohttp.ClientError, OSError) as e: + logger.warning(f"Unable to align steady clock offset for {server_url}: {e}; skipping adjustment") diff --git a/tensorrt_llm/serve/perf_metrics.py b/tensorrt_llm/serve/perf_metrics.py index 60b65179ea..a8279e6ced 100644 --- a/tensorrt_llm/serve/perf_metrics.py +++ b/tensorrt_llm/serve/perf_metrics.py @@ -15,7 +15,7 @@ import asyncio from collections import defaultdict, deque from dataclasses import dataclass -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional from tensorrt_llm.llmapi.disagg_utils import ServerRole @@ -64,7 +64,7 @@ class MetricsDefinition: buckets: Optional[List[float]] = None -METRICS_DEFINITIONS = [ +CLIENT_METRICS_DEFINITIONS = [ MetricsDefinition("total_requests", "Total number of requests", "counter"), MetricsDefinition("error_requests", "Total number of error requests", "counter"), MetricsDefinition("retry_requests", "Total number of retry requests", "counter"), @@ -96,23 +96,29 @@ ROLE_TO_CLIENT_TYPE = { } +def instance_metric(definition: MetricsDefinition, role: Optional[ServerRole] = None): + # import lazily to avoid breaking `set_prometheus_multiproc_dir` + from prometheus_client import Counter, Histogram + + name = ( + f"{ROLE_TO_CLIENT_TYPE[role]}_{definition.name}" + if role in ROLE_TO_CLIENT_TYPE + else definition.name + ) + if definition.type == "counter": + return Counter(name, definition.description) + elif definition.type == "histogram": + return Histogram(name, definition.description, buckets=definition.buckets) + else: + raise ValueError(f"Invalid metric type: {definition.type}") + + class ClientMetricsCollector: def __init__(self, role: ServerRole): self._role = role - # 
import lazily to avoid breaking `set_prometheus_multiproc_dir` - from prometheus_client import Counter, Histogram - - def instance_metric(definition: MetricsDefinition) -> Union[Counter | Histogram]: - name = f"{ROLE_TO_CLIENT_TYPE[role]}_{definition.name}" - if definition.type == "counter": - return Counter(name, definition.description) - elif definition.type == "histogram": - return Histogram(name, definition.description, buckets=definition.buckets) - else: - raise ValueError(f"Invalid metric type: {definition.type}") - self._metrics = { - definition.name: instance_metric(definition) for definition in METRICS_DEFINITIONS + definition.name: instance_metric(definition, role) + for definition in CLIENT_METRICS_DEFINITIONS } def __getattr__( @@ -121,6 +127,23 @@ class ClientMetricsCollector: return self._metrics[key] +SERVER_METRICS_DEFINITIONS = [ + MetricsDefinition("total_requests", "Total number of requests", "counter"), + MetricsDefinition("stream_requests", "Total number of stream requests", "counter"), + MetricsDefinition("nonstream_requests", "Total number of non-stream requests", "counter"), + MetricsDefinition("validation_exceptions", "Total number of validation exceptions", "counter"), + MetricsDefinition("http_exceptions", "Total number of HTTP exceptions", "counter"), + MetricsDefinition("internal_errors", "Total number of internal errors", "counter"), + MetricsDefinition("total_responses", "Total number of responses", "counter"), + MetricsDefinition( + "queue_latency_seconds", + "Histogram of latency from request arrival to being processed in seconds", + "histogram", + SHORT_TIME_BUCKETS, + ), +] + + class DisaggPerfMetricsCollector: def __init__(self, max_requests: int): self._max_requests = max_requests @@ -128,10 +151,17 @@ class DisaggPerfMetricsCollector: self._server_metrics = defaultdict(dict) self._lock = asyncio.Lock() self._clients = [] + self._metrics = { + definition.name: instance_metric(definition) + for definition in 
SERVER_METRICS_DEFINITIONS + } def add_client(self, client): self._clients.append(client) + def __getattr__(self, key: str): + return self._metrics[key] + async def add_per_request_metrics( self, ctx_server: str, diff --git a/tensorrt_llm/serve/router.py b/tensorrt_llm/serve/router.py index a56255dd25..a3d3939886 100644 --- a/tensorrt_llm/serve/router.py +++ b/tensorrt_llm/serve/router.py @@ -1,7 +1,7 @@ import asyncio import heapq from abc import ABC, abstractmethod -from typing import Dict, Iterable, List, Optional, Union +from typing import Awaitable, Callable, Dict, Iterable, List, Optional, Union import aiohttp from transformers import AutoTokenizer @@ -145,9 +145,15 @@ class KvCacheAwareServerState(ServerState): class Router(ABC): - def __init__(self, server_role: ServerRole, servers: List[str], - metadata_server_cfg: Optional[MetadataServerConfig], - metadata_server: Optional[JsonDictionary]): + def __init__( + self, + server_role: ServerRole, + servers: List[str], + metadata_server_cfg: Optional[MetadataServerConfig], + metadata_server: Optional[JsonDictionary], + server_preparation_func: Optional[Callable[[str], + Awaitable[None]]] = None, + **kwargs): self._servers = servers or [] self._metadata_server = metadata_server self._server_role = server_role @@ -155,6 +161,7 @@ class Router(ABC): self._monitor_task = None self._session = None self._health_check_timeout = metadata_server_cfg.health_check_timeout if metadata_server_cfg else None + self._server_preparation_func = server_preparation_func @abstractmethod def _on_servers_updated(self, old_servers, new_servers): @@ -169,16 +176,26 @@ class Router(ABC): def servers(self) -> List[str]: return self._servers + async def _prepare_server(self, server: str): + if self._server_preparation_func: + await self._server_preparation_func(server) + + async def prepare_servers(self, servers: Optional[List[str]] = None): + for server in servers or self._servers: + await self._prepare_server(server) + async def 
add_server(self, server: str): if server in self._servers: logger.warning(f"Server {server} already exists") return + await self._prepare_server(server) async with self._lock: old_servers = self._servers.copy() self._servers = [*old_servers, server] self._on_servers_updated(old_servers, self._servers) logger.debug( - f"Added server {server}, current server list: {self._servers}") + f"Added server {server}, {self._server_role.name} current server list: {self._servers}" + ) async def remove_server(self, server: str): if server not in self._servers: @@ -275,6 +292,7 @@ class Router(ABC): # Log added servers for server in final_servers: if server not in old_servers: + await self._prepare_server(server) logger.info(f"Server {server} is added") else: logger.debug( @@ -419,7 +437,7 @@ class RoundRobinRouter(Router): metadata_server: JsonDictionary = None, **kwargs): super().__init__(server_role, servers, metadata_server_cfg, - metadata_server) + metadata_server, **kwargs) self._server_idx = 0 def _on_servers_updated(self, old_servers, new_servers): @@ -463,7 +481,7 @@ class LoadBalancingRouter(Router): use_tokens: bool = False, **kwargs): super().__init__(server_role, servers, metadata_server_cfg, - metadata_server) + metadata_server, **kwargs) # Load map between servers and their number of tokens processed self._server_state = {} self._server_load_heap = [] @@ -550,7 +568,7 @@ class KvCacheAwareRouter(Router): tokens_per_block: int = 32, **kwargs): super().__init__(server_role, servers, metadata_server_cfg, - metadata_server) + metadata_server, **kwargs) self._lock = asyncio.Lock() self._use_tokens = use_tokens @@ -647,10 +665,13 @@ class KvCacheAwareRouter(Router): self._server_state.pop(old_server, None) -def create_router(router_config: Optional[RouterConfig], - servers: Optional[List[str]], - metadata_server_cfg: Optional[MetadataServerConfig] = None, - metadata_server: Optional[JsonDictionary] = None) -> Router: +def create_router( + router_config: 
Optional[RouterConfig], + servers: Optional[List[str]], + metadata_server_cfg: Optional[MetadataServerConfig] = None, + metadata_server: Optional[JsonDictionary] = None, + server_preparation_func: Optional[Callable[[str], Awaitable[None]]] = None +) -> Router: """ Factory function to create different types of router instances. @@ -681,5 +702,8 @@ def create_router(router_config: Optional[RouterConfig], extra_args = router_config.args if router_config else {} return router_class(router_config.server_role if router_config else None, - servers, metadata_server_cfg, metadata_server, + servers, + metadata_server_cfg, + metadata_server, + server_preparation_func=server_preparation_func, **extra_args) diff --git a/tests/integration/defs/disaggregated/test_auto_scaling.py b/tests/integration/defs/disaggregated/test_auto_scaling.py index b96a6b3615..a3f4db28cb 100644 --- a/tests/integration/defs/disaggregated/test_auto_scaling.py +++ b/tests/integration/defs/disaggregated/test_auto_scaling.py @@ -154,7 +154,7 @@ def _run_worker(model_name, worker_config, role, port, work_dir, device=-1): env = os.environ.copy() if device != -1: env["CUDA_VISIBLE_DEVICES"] = str(device) - log_path = os.path.join(work_dir, f"output_{role}.log") + log_path = os.path.join(work_dir, f"output_{role}_{port}.log") log_file = open(log_path, "w+") print(f"Running {role} on port {port}") return ProcessWrapper(subprocess.Popen(cmd, diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 1c43dd50e2..a0d325c737 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -27,6 +27,8 @@ from defs.common import (revise_disagg_config_file_with_free_ports, from defs.conftest import (get_sm_version, llm_models_root, skip_arm, skip_no_hopper) from defs.trt_test_alternative import check_call, check_output, popen +from test_common.perf_metrics_utils 
import (get_timing_metrics, + validate_timing_metrics) from tensorrt_llm._utils import get_free_port, mpi_disabled from tensorrt_llm.logger import logger @@ -41,112 +43,6 @@ def cleanup_output_files(): pass -def validate_timing_metrics(perf_metrics_item, request_context=""): - """ - Helper function to validate timing metrics relationships. - - Args: - perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint - request_context: String context for error messages (e.g., "request 1", "streaming") - """ - # Validate basic structure - required_keys = [ - "ctx_server", "gen_server", "ctx_perf_metrics", "gen_perf_metrics", - "disagg_server_arrival_time", "disagg_server_first_token_time" - ] - for key in required_keys: - assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" - - assert perf_metrics_item["ctx_perf_metrics"][ - "ctx_request_id"] == perf_metrics_item["gen_perf_metrics"][ - "ctx_request_id"] - - # Extract timing metrics - ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"][ - "timing_metrics"] - disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] - disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] - - # Validate disaggregated server timing metrics - assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" - assert disagg_first_token is not None, f"disagg_server_first_token_time is None in {request_context}" - assert isinstance( - disagg_arrival, - (int, float - )), f"disagg_server_arrival_time is not numeric in {request_context}" - assert isinstance( - disagg_first_token, (int, float) - ), f"disagg_server_first_token_time is not numeric in {request_context}" - assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" - assert disagg_first_token > 0, f"disagg_server_first_token_time is not positive in 
{request_context}" - assert disagg_arrival <= disagg_first_token, f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" - - # Validate server-level timing metrics for context server - ctx_server_arrival = ctx_metrics.get("server_arrival_time") - ctx_server_first_token = ctx_metrics.get("server_first_token_time") - assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" - assert ctx_server_first_token is not None, f"ctx server_first_token_time is None in {request_context}" - assert isinstance( - ctx_server_arrival, - (int, - float)), f"ctx server_arrival_time is not numeric in {request_context}" - assert isinstance( - ctx_server_first_token, - (int, float - )), f"ctx server_first_token_time is not numeric in {request_context}" - assert ctx_server_arrival <= ctx_server_first_token, f"ctx server_arrival_time > server_first_token_time in {request_context}" - assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 - - # Validate server-level timing metrics for generation server - gen_server_arrival = gen_metrics.get("server_arrival_time") - gen_server_first_token = gen_metrics.get("server_first_token_time") - assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" - assert gen_server_first_token is not None, f"gen server_first_token_time is None in {request_context}" - assert isinstance( - gen_server_arrival, - (int, - float)), f"gen server_arrival_time is not numeric in {request_context}" - assert isinstance( - gen_server_first_token, - (int, float - )), f"gen server_first_token_time is not numeric in {request_context}" - assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" - - # Network Time Protocol can ensure ms-level accuracy in LAN - ntp_tolerance = 1e-3 - - # Validate timing relationships between different levels - # Disaggregated server should receive request before 
individual servers - assert disagg_arrival - ntp_tolerance <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" - assert disagg_arrival - ntp_tolerance <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" - - # Context should complete before generation starts - assert ctx_server_first_token - ntp_tolerance <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" - - # Validate internal timing consistency - ctx_arrival_time = ctx_metrics["arrival_time"] - ctx_first_token_time = ctx_metrics["first_token_time"] - gen_arrival_time = gen_metrics["arrival_time"] - gen_first_token_time = gen_metrics["first_token_time"] - - assert ctx_arrival_time <= ctx_first_token_time, f"ctx arrival_time > first_token_time in {request_context}" - assert gen_arrival_time <= gen_first_token_time, f"gen arrival_time > first_token_time in {request_context}" - - # Test KV cache transfer timing (if present) - if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: - kv_start = gen_metrics["kv_cache_transfer_start"] - kv_end = gen_metrics["kv_cache_transfer_end"] - assert gen_metrics["kv_cache_size"] > 0 - assert kv_start <= kv_end, f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" - assert gen_arrival_time <= kv_start, f"gen_arrival_time > kv_cache_transfer_start in {request_context}" - assert kv_end <= gen_metrics[ - "first_scheduled_time"], f"kv_cache_transfer_end > first_scheduled_time in {request_context}" - - return True - - def get_disagg_server_url_from_cfg(config_file: str) -> tuple[str, int]: with open(config_file, 'r') as file: config = yaml.safe_load(file) @@ -828,16 +724,7 @@ def test_disaggregated_perf_metrics(disaggregated_test_root, llm_venv, os.symlink(src, dst, target_is_directory=True) def extra_endpoints_test(server_url: str): - import json - import urllib.request - - with urllib.request.urlopen(f"{server_url}/perf_metrics", - 
timeout=10) as resp: - assert resp.status == 200 - perf_metrics = json.load(resp) - assert len(perf_metrics) > 0 - item = perf_metrics[0] - + item = get_timing_metrics(server_url) # Use helper function to validate all timing metrics comprehensively validate_timing_metrics(item, "perf_metrics test") diff --git a/tests/integration/defs/examples/serve/test_serve_negative.py b/tests/integration/defs/examples/serve/test_serve_negative.py index 2b996b1502..dcfcb356bd 100644 --- a/tests/integration/defs/examples/serve/test_serve_negative.py +++ b/tests/integration/defs/examples/serve/test_serve_negative.py @@ -9,7 +9,6 @@ These tests verify that trtllm-serve handles error conditions gracefully: """ import asyncio -import socket import time from pathlib import Path @@ -19,11 +18,7 @@ import requests from defs.conftest import llm_models_root from defs.trt_test_alternative import popen, print_error, print_info - -def _find_free_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] +from tensorrt_llm._utils import get_free_port class RemoteOpenAIServer: @@ -63,7 +58,7 @@ def server(model_path): """Start a test server for the module using popen like test_serve.py""" host_bind = "0.0.0.0" client_host = "localhost" - port = _find_free_port() + port = get_free_port() cmd = [ "trtllm-serve", "serve", diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index a52ec84989..6e14592a37 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -31,6 +31,7 @@ from _pytest.nodes import Item from _pytest.python import Function from defs.trt_test_alternative import (check_output, popen, print_error, print_info) +from test_common.http_utils import wait_for_endpoint_ready from tensorrt_llm._utils import get_free_port @@ -251,29 +252,6 @@ class PerfAggrScriptTestCmds(NamedTuple): timeout: int output_dir: str - def wait_for_endpoint_ready(self, url: 
str, timeout: int = 7200): - start = time.monotonic() - while True: - elapsed_time = time.monotonic() - start - if elapsed_time > timeout: - print_error( - f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds" - ) - break - try: - print_info( - f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s" - ) - time.sleep(1) - if requests.get(url).status_code == 200: - print_info(f"endpoint {url} is ready") - return - except Exception as err: - print_info( - f"endpoint {url} is not ready, with exception: {err}") - print_error( - f"Endpoint {url} did not become ready within {timeout} seconds") - def run_cmd(self, cmd_idx: int, venv) -> str: output = "" server_proc = None @@ -294,7 +272,7 @@ class PerfAggrScriptTestCmds(NamedTuple): stderr=subprocess.STDOUT, env=copy.deepcopy(os.environ), ) - self.wait_for_endpoint_ready( + wait_for_endpoint_ready( f"http://{server_hostname}:{server_port}/health", timeout=self.timeout) client_cmd = add_host_port_to_cmd(self.client_cmds[cmd_idx], @@ -323,19 +301,6 @@ class PerfDisaggScriptTestCmds(NamedTuple): client_cmd: List[str] benchmark_cmd: List[str] - def wait_for_endpoint_ready(self, url: str, timeout: int = 600): - start = time.monotonic() - while time.monotonic() - start < timeout: - try: - time.sleep(1) - if requests.get(url).status_code == 200: - print(f"endpoint {url} is ready") - return - except Exception as err: - print(f"endpoint {url} is not ready, with exception: {err}") - print_error( - f"Endpoint {url} did not become ready within {timeout} seconds") - def run_cmd(self, cmd_idx: int, venv) -> str: output = "" try: @@ -360,7 +325,7 @@ class PerfDisaggScriptTestCmds(NamedTuple): stderr=subprocess.STDOUT, env=venv._new_env, shell=True) as server_proc): - self.wait_for_endpoint_ready( + wait_for_endpoint_ready( f"http://localhost:8000/health", timeout=1800) # 30 minutes for large models check_output(self.client_cmd, env=venv._new_env) diff --git a/tests/integration/defs/pytest.ini 
b/tests/integration/defs/pytest.ini index dcca875f03..a4b1c263f5 100644 --- a/tests/integration/defs/pytest.ini +++ b/tests/integration/defs/pytest.ini @@ -5,7 +5,7 @@ threadleak_exclude = asyncio_\d+ junit_family=legacy addopts = --ignore-glob="*perf/test_perf.py" --ignore-glob="*perf/disagg/*" --ignore-glob="*test_list_validation.py" --ignore-glob="*llm-test-workspace*" --durations=0 -W ignore::DeprecationWarning pythonpath = - ../../../examples/auto_deploy + ../../../examples/auto_deploy ../../ norecursedirs = ./triton/perf ./perf/disagg markers = skip_less_device: skip when less device detected than the declared diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 840e856b29..f1bd5315b3 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1769,6 +1769,21 @@ def test_trtllm_multimodal_benchmark_serving(llm_root, llm_venv): ]) +@pytest.mark.skip_less_device(4) +@pytest.mark.skip_less_device_memory(40000) +@pytest.mark.parametrize("service_discovery", ["etcd", "http"]) +def test_openai_disagg_multi_nodes_completion_service_discovery( + llm_root, llm_venv, service_discovery): + test_root = unittest_path() / "llmapi" / "apps" + llm_venv.run_cmd([ + "-m", + "pytest", + str(test_root / + f"_test_disagg_serving_multi_nodes_service_discovery.py::test_completion[{service_discovery}]" + ), + ]) + + @pytest.mark.skip_less_device(4) @pytest.mark.skip_less_device_memory(40000) @pytest.mark.parametrize("gen_config", diff --git a/tests/integration/test_lists/qa/llm_function_multinode.txt b/tests/integration/test_lists/qa/llm_function_multinode.txt index f2e3f8d216..c95bbb053c 100644 --- a/tests/integration/test_lists/qa/llm_function_multinode.txt +++ b/tests/integration/test_lists/qa/llm_function_multinode.txt @@ -11,3 +11,5 @@ test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu] test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] 
test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] +test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[http] +test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[etcd] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index a239bf32d6..63817ed9af 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -42,6 +42,7 @@ l0_dgx_h100: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-False-True] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-False] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] + - unittest/llmapi/apps/test_disagg_serving_perf_metrics.py # ------------- AutoDeploy tests --------------- - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] # llmapi diff --git a/tests/scripts/perf-sanity/run_benchmark_serve.py b/tests/scripts/perf-sanity/run_benchmark_serve.py index 34bca0d093..3f16f7273c 100644 --- a/tests/scripts/perf-sanity/run_benchmark_serve.py +++ b/tests/scripts/perf-sanity/run_benchmark_serve.py @@ -4,12 +4,11 @@ import ast import os import subprocess import sys -import time from pathlib import Path from typing import Dict, List, NamedTuple -import requests import yaml +from test_common.http_utils import wait_for_endpoint_ready def get_node_name() -> str: @@ -568,19 +567,6 @@ class PerfServerBenchmarkCmds(NamedTuple): names: List[str] working_dir: str - def wait_for_endpoint_ready(self, url: str, timeout: int = 5400): - start = time.monotonic() - while time.monotonic() - start < timeout: - try: - time.sleep(10) - if requests.get(url, timeout=5).status_code == 200: - print(f"endpoint {url} is ready") - 
return - except Exception as err: - print(f"endpoint {url} is not ready, with exception: {err}") - print_error( - f"Endpoint {url} did not become ready within {timeout} seconds") - def run_cmd(self, cmd_idx: int, node_name: str, @@ -601,8 +587,8 @@ class PerfServerBenchmarkCmds(NamedTuple): stderr=subprocess.STDOUT) # Wait for server to be ready - self.wait_for_endpoint_ready("http://localhost:8000/v1/models", - timeout=max_timeout) + wait_for_endpoint_ready("http://localhost:8000/v1/models", + timeout=max_timeout) # Save node name, gpu info, server config, client config output to server file path with open(client_file_path, 'w') as client_ctx: diff --git a/tests/test_common/__init__.py b/tests/test_common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_common/http_utils.py b/tests/test_common/http_utils.py new file mode 100644 index 0000000000..1f4aedce2b --- /dev/null +++ b/tests/test_common/http_utils.py @@ -0,0 +1,29 @@ +import time + +import requests + + +def wait_for_endpoint_ready(url: str, timeout: int = 300): + start = time.monotonic() + while time.monotonic() - start < timeout: + try: + time.sleep(1) + if requests.get(url, timeout=5).status_code == 200: + print(f"endpoint {url} is ready") + return + except Exception as err: + print(f"endpoint {url} is not ready, with exception: {err}") + raise RuntimeError(f"Endpoint {url} did not become ready within {timeout} seconds") + + +def wait_for_endpoint_down(url: str, timeout: int = 300): + start = time.monotonic() + while time.monotonic() - start < timeout: + try: + if requests.get(url, timeout=5).status_code >= 100: + print(f"endpoint {url} returned status code {requests.get(url).status_code}") + time.sleep(1) + except Exception as err: + print(f"endpoint {url} is down, with exception: {err}") + return + raise RuntimeError(f"Endpoint {url} did not become down within {timeout} seconds") diff --git a/tests/test_common/perf_metrics_utils.py 
b/tests/test_common/perf_metrics_utils.py new file mode 100644 index 0000000000..c63faa6d81 --- /dev/null +++ b/tests/test_common/perf_metrics_utils.py @@ -0,0 +1,188 @@ +import requests + + +def get_timing_metrics(server_url: str): + response = requests.get(f"{server_url}/perf_metrics", timeout=10) + assert response.status_code == 200 + perf_metrics = response.json() + assert len(perf_metrics) > 0 + return perf_metrics[0] + + +def validate_timing_metrics(perf_metrics_item, request_context="", time_tolerance_seconds=0.005): + """Helper function to validate timing metrics relationships. + + Args: + perf_metrics_item: A single performance metrics item from the /perf_metrics endpoint + request_context: String context for error messages (e.g., "request 1", "streaming") + """ + # Validate basic structure + required_keys = [ + "ctx_server", + "gen_server", + "ctx_perf_metrics", + "gen_perf_metrics", + "disagg_server_arrival_time", + "disagg_server_first_token_time", + ] + for key in required_keys: + assert key in perf_metrics_item, f"Missing key: {key} in {request_context}" + + assert ( + perf_metrics_item["ctx_perf_metrics"]["ctx_request_id"] + == perf_metrics_item["gen_perf_metrics"]["ctx_request_id"] + ) + + # Extract timing metrics + ctx_metrics = perf_metrics_item["ctx_perf_metrics"]["perf_metrics"]["timing_metrics"] + gen_metrics = perf_metrics_item["gen_perf_metrics"]["perf_metrics"]["timing_metrics"] + disagg_arrival = perf_metrics_item["disagg_server_arrival_time"] + disagg_first_token = perf_metrics_item["disagg_server_first_token_time"] + + # Validate disaggregated server timing metrics + assert disagg_arrival is not None, f"disagg_server_arrival_time is None in {request_context}" + assert disagg_first_token is not None, ( + f"disagg_server_first_token_time is None in {request_context}" + ) + assert isinstance(disagg_arrival, (int, float)), ( + f"disagg_server_arrival_time is not numeric in {request_context}" + ) + assert isinstance(disagg_first_token, (int, 
float)), ( + f"disagg_server_first_token_time is not numeric in {request_context}" + ) + assert disagg_arrival > 0, f"disagg_server_arrival_time is not positive in {request_context}" + assert disagg_first_token > 0, ( + f"disagg_server_first_token_time is not positive in {request_context}" + ) + assert disagg_arrival <= disagg_first_token, ( + f"disagg_server_arrival_time > disagg_server_first_token_time in {request_context}" + ) + + # Validate server-level timing metrics for context server + ctx_server_arrival = ctx_metrics.get("server_arrival_time") + ctx_server_first_token = ctx_metrics.get("server_first_token_time") + assert ctx_server_arrival is not None, f"ctx server_arrival_time is None in {request_context}" + assert ctx_server_first_token is not None, ( + f"ctx server_first_token_time is None in {request_context}" + ) + assert isinstance(ctx_server_arrival, (int, float)), ( + f"ctx server_arrival_time is not numeric in {request_context}" + ) + assert isinstance(ctx_server_first_token, (int, float)), ( + f"ctx server_first_token_time is not numeric in {request_context}" + ) + assert ctx_server_arrival <= ctx_server_first_token, ( + f"ctx server_arrival_time > server_first_token_time in {request_context}" + ) + assert ctx_metrics["last_token_time"] - ctx_server_first_token < 1e-3 + + # Validate server-level timing metrics for generation server + gen_server_arrival = gen_metrics.get("server_arrival_time") + gen_server_first_token = gen_metrics.get("server_first_token_time") + assert gen_server_arrival is not None, f"gen server_arrival_time is None in {request_context}" + assert gen_server_first_token is not None, ( + f"gen server_first_token_time is None in {request_context}" + ) + assert isinstance(gen_server_arrival, (int, float)), ( + f"gen server_arrival_time is not numeric in {request_context}" + ) + assert isinstance(gen_server_first_token, (int, float)), ( + f"gen server_first_token_time is not numeric in {request_context}" + ) + assert 
gen_server_arrival <= gen_server_first_token, ( + f"gen server_arrival_time > server_first_token_time in {request_context}" + ) + + # Validate timing relationships between different levels + # Disaggregated server should receive request before individual servers + # Allow some tolerance of a local network ping time when comparing the times from disagg and ctx/gen servers + # by taking consideration of the error of NTP (1/2 ping time). + assert disagg_arrival <= ctx_server_arrival + time_tolerance_seconds, ( + f"disagg_arrival {disagg_arrival} > ctx_server_arrival {ctx_server_arrival} in {request_context}" + ) + assert disagg_arrival <= gen_server_arrival + time_tolerance_seconds, ( + f"disagg_arrival {disagg_arrival} > gen_server_arrival {gen_server_arrival} in {request_context}" + ) + + # Context should complete before generation starts + assert ctx_server_first_token <= gen_server_arrival + time_tolerance_seconds, ( + f"ctx_server_first_token > gen_server_arrival in {request_context}" + ) + + # Validate internal timing consistency + ctx_arrival_time = ctx_metrics["arrival_time"] + ctx_first_token_time = ctx_metrics["first_token_time"] + gen_arrival_time = gen_metrics["arrival_time"] + gen_first_token_time = gen_metrics["first_token_time"] + + assert ctx_arrival_time <= ctx_first_token_time, ( + f"ctx arrival_time > first_token_time in {request_context}" + ) + assert gen_arrival_time <= gen_first_token_time, ( + f"gen arrival_time > first_token_time in {request_context}" + ) + + # Test KV cache transfer timing (if present) + if "kv_cache_transfer_start" in gen_metrics and "kv_cache_transfer_end" in gen_metrics: + kv_start = gen_metrics["kv_cache_transfer_start"] + kv_end = gen_metrics["kv_cache_transfer_end"] + assert gen_metrics["kv_cache_size"] > 0 + assert kv_start <= kv_end, ( + f"kv_cache_transfer_start > kv_cache_transfer_end in {request_context}" + ) + assert gen_arrival_time <= kv_start, ( + f"gen_arrival_time > kv_cache_transfer_start in 
{request_context}" + ) + assert kv_end <= gen_metrics["first_scheduled_time"], ( + f"kv_cache_transfer_end > first_scheduled_time in {request_context}" + ) + + return True + + +def get_prometheus_metrics(server_url: str): + response = requests.get(server_url + "/prometheus/metrics") + assert response.status_code == 200 + # Parse Prometheus metrics lines into a dictionary of {metric_name: value} + metrics = {} + print(response.text) + for line in response.text.split("\n"): + if line.startswith("#") or not line.strip(): + continue + parts = line.split() + if len(parts) < 2: + continue + metric = parts[0] + try: + value = float(parts[1]) + except ValueError: + continue + import re + + if bucket_match := re.match(r'(.+)_bucket\{le="([^"]+)"\}', metric): + # Try to parse bucket boundaries out of metrics like ..._bucket{le="0.005"} + base_metric, le_value = bucket_match.groups() + if base_metric not in metrics: + metrics[base_metric] = {} + try: + metrics[base_metric][float(le_value)] = value + except ValueError: + continue + elif sum_match := re.match(r"(.+)_sum$", metric): + base_metric = sum_match.groups()[0] + if base_metric not in metrics: + metrics[base_metric] = {} + metrics[base_metric]["sum"] = value + elif count_match := re.match(r"(.+)_count$", metric): + base_metric = count_match.groups()[0] + if base_metric not in metrics: + metrics[base_metric] = {} + metrics[base_metric]["count"] = value + elif total_match := re.match(r"(.+)_total$", metric): + base_metric = total_match.groups()[0] + print(f"Total metric {metric}: {base_metric} = {value}") + metrics[base_metric] = value + else: + # ignore prometheus built-in metrics + pass + return metrics diff --git a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py index 14ba1a160a..0c52852b9e 100644 --- a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py +++ b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py @@ 
-4,11 +4,15 @@ import time import openai import pytest -import requests +from test_common.http_utils import (wait_for_endpoint_down, + wait_for_endpoint_ready) +from test_common.perf_metrics_utils import (get_timing_metrics, + validate_timing_metrics) from ..test_llm import get_model_path from .openai_server import RemoteDisaggOpenAIServer, RemoteOpenAIServer -from .utils import expand_slurm_nodelist +from .utils import (expand_slurm_nodelist, wait_for_endpoint_down, + wait_for_endpoint_ready) RANK = int(os.environ.get("SLURM_PROCID", 0)) NODE_RANK = int(os.environ.get("SLURM_NODEID", 0)) @@ -19,7 +23,8 @@ pytestmark = pytest.mark.threadleak(enabled=False) # This test assumes that there are >2 nodes, we run ctx/disagg-server/client on the first node, # and run gen the second node. - +# This is a multi-node test, and will not be scheduled to the same node running other tests +# using fixed ports should be safe. CTX_SERVER_PORT = 8001 GEN_SERVER_PORT = 8002 DISAGG_SERVER_PORT = 8000 @@ -65,6 +70,7 @@ def env(): k: v for k, v in os.environ.items() if not ('PMI_' in k or 'OMPI_' in k or 'PMIX_' in k or 'SLURM_' in k) + and k not in ["UCX_TLS", "UCX_NET_DEVICES"] # avoid UCX failure on oci } @@ -105,6 +111,8 @@ def worker(model_name: str, ctx_tp_pp_size: tuple, gen_tp_pp_size: tuple): "enable_block_reuse": False, }, "disable_overlap_scheduler": True, + "perf_metrics_max_requests": 1000, + "return_perf_metrics": True, } if is_ctx_node(): print(f"starting ctx_server for rank {RANK} node rank {NODE_RANK}") @@ -138,32 +146,6 @@ def worker(model_name: str, ctx_tp_pp_size: tuple, gen_tp_pp_size: tuple): yield None -def wait_for_endpoint_ready(url: str, timeout: int = 300): - start = time.monotonic() - while time.monotonic() - start < timeout: - try: - time.sleep(1) - if requests.get(url).status_code == 200: - print(f"endpoint {url} is ready") - return - except Exception as err: - print(f"endpoint {url} is not ready, with exception: {err}") - - -def wait_for_endpoint_down(url: 
str, timeout: int = 300): - start = time.monotonic() - while time.monotonic() - start < timeout: - try: - if requests.get(url).status_code >= 100: - print( - f"endpoint {url} returned status code {requests.get(url).status_code}" - ) - time.sleep(1) - except Exception as err: - print(f"endpoint {url} is down, with exception: {err}") - return - - @pytest.fixture(scope="module") def disagg_server(worker: RemoteOpenAIServer): if is_disagg_node(): @@ -210,6 +192,14 @@ def test_completion(client: openai.OpenAI, assert completion.id is not None message = completion.choices[0].text assert message.startswith('2.') + + perf_metrics = get_timing_metrics(disagg_server.url_root) + # allow 5ms leniency when comparing the time points from disagg and ctx/gen servers + validate_timing_metrics(perf_metrics, + "multinode test_completion", + time_leniency_seconds=0.005) + # sleep 10 seconds to ensure a successful wait_for_endpoint_ready on rank1 + time.sleep(10) disagg_server.terminate() elif is_gen_node(): diff --git a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes_service_discovery.py b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes_service_discovery.py new file mode 100644 index 0000000000..dc0bb4396b --- /dev/null +++ b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes_service_discovery.py @@ -0,0 +1,220 @@ +import os +import shutil +import subprocess +import tempfile +import uuid + +import openai +import pytest +from test_common.perf_metrics_utils import get_timing_metrics, validate_timing_metrics + +from tensorrt_llm._utils import get_free_port +from tensorrt_llm.llmapi.disagg_utils import ServerRole + +from ..test_llm import get_model_path +from .openai_server import RemoteDisaggOpenAIServer, RemoteOpenAIServer +from .utils import expand_slurm_nodelist, wait_for_endpoint_down, wait_for_endpoint_ready + +RANK = int(os.environ.get("SLURM_PROCID", 0)) +NODE_RANK = int(os.environ.get("SLURM_NODEID", 0)) +NODE_LIST = 
expand_slurm_nodelist(os.environ.get("SLURM_NODELIST", "")) +SLURM_NTASKS_PER_NODE = int(os.environ.get("SLURM_NTASKS_PER_NODE", 1)) + +# This a multi-node QA test, use a fixed port instead of finding a free port +# so that all nodes can have the same disagg server config +DISAGG_SERVER_PORT = 8000 + + +# This test is supposed to run with 2 nodes or more +def is_ctx_node(): + assert len(NODE_LIST) == 2 + return NODE_RANK == 0 + + +def is_gen_node(): + assert len(NODE_LIST) == 2 + return NODE_RANK == 1 + + +def is_disagg_node(): + return NODE_RANK == 0 + + +# The test is run on multinodes but only the first node's output is used for assertion +def is_pytest_node(): + return NODE_RANK == 0 + + +def env(): + # Remove MPI related environment variables to isolate the ctx/gen processes + # so that they will not be in the same MPI communicator, otherwise the rank and world_size may mismatch + return { + k: v + for k, v in os.environ.items() + if not ("PMI_" in k or "OMPI_" in k or "PMIX_" in k or "SLURM_" in k) + and k not in ["UCX_TLS", "UCX_NET_DEVICES"] + } + + +@pytest.fixture +def model_name(): + return "llama-3.1-model/Llama-3.1-8B-Instruct" + + +@pytest.fixture +def disagg_host(): + return NODE_LIST[0] + + +@pytest.fixture(params=["etcd", "http"]) +def service_discovery(request, disagg_host: str): + if request.param == "etcd": + work_dir = tempfile.mkdtemp() + data_dir = f"{work_dir}/disagg_test-etcd-{uuid.uuid4()}" + etcd = subprocess.Popen(["etcd", "--data-dir", data_dir]) + yield etcd, f"etcd://{disagg_host}:2379" + try: + etcd.kill() + etcd.wait(timeout=10) + shutil.rmtree(data_dir) + except Exception: + pass + else: + yield None, f"http://{disagg_host}:{DISAGG_SERVER_PORT}" + + +@pytest.fixture +def disagg_cluster_config(service_discovery: tuple): + _, uri = service_discovery + return { + "cluster_uri": uri, + "cluster_name": "", + } + + +@pytest.fixture +def worker(model_name: str, disagg_cluster_config: dict): + extra_config = { + "disagg_cluster": 
disagg_cluster_config, + "cache_transceiver_config": {"backend": "DEFAULT"}, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5, + "enable_block_reuse": False, + }, + "disable_overlap_scheduler": True, + "return_perf_metrics": True, + "perf_metrics_max_requests": 1000, + } + # start workers on 0.0.0.0:, then the workers should be able to + # report their correct hostname:port to the disagg server + port = get_free_port() + if is_ctx_node(): + print(f"starting ctx_server for rank {RANK} node rank {NODE_RANK}") + model_path = get_model_path(model_name) + tp_size, pp_size = 1, 1 + args = ["--tp_size", str(tp_size), "--pp_size", str(pp_size)] + with RemoteOpenAIServer( + model_path, + port=port, + cli_args=args, + host="0.0.0.0", + env=env(), + llmapi_launch=False, + rank=RANK % SLURM_NTASKS_PER_NODE, + extra_config=extra_config, + role=ServerRole.CONTEXT, + ) as server: + yield server + elif is_gen_node(): + print(f"starting gen_server for rank {RANK} node rank {NODE_RANK}") + model_path = get_model_path(model_name) + tp_size, pp_size = 1, 1 + args = ["--tp_size", str(tp_size), "--pp_size", str(pp_size)] + with RemoteOpenAIServer( + model_path, + port=port, + cli_args=args, + host="0.0.0.0", + env=env(), + llmapi_launch=False, + rank=RANK % SLURM_NTASKS_PER_NODE, + extra_config=extra_config, + role=ServerRole.GENERATION, + ) as server: + yield server + else: + yield None + + +# different from non-service-discovery version, disagg server doesn't have to +# wait for ctx/gen servers to get ready +@pytest.fixture +def disagg_server(disagg_cluster_config: dict): + if is_disagg_node(): + disagg_config = { + "disagg_cluster": disagg_cluster_config, + "port": DISAGG_SERVER_PORT, + "hostname": "0.0.0.0", + "perf_metrics_max_requests": 1000, + } + print(f"starting disagg_server for rank {RANK} node rank {NODE_RANK}") + # ctx/gen servers are unnecessary for service discovery test + with RemoteDisaggOpenAIServer( + ctx_servers=[], + gen_servers=[], + 
port=DISAGG_SERVER_PORT, + disagg_config=disagg_config, + llmapi_launch=False, + env=env(), + wait_ready=False, # wait it to be ready in test body + ) as server: + yield server + else: + print(f"skipping disagg_server for rank {RANK} node rank {NODE_RANK}") + yield None + + +@pytest.fixture +def client(disagg_server: RemoteDisaggOpenAIServer): + if is_pytest_node(): + return disagg_server.get_client() + else: + print(f"skipping client for rank {RANK} node rank {NODE_RANK}") + return None + + +def test_completion( + disagg_server: RemoteDisaggOpenAIServer, + worker: RemoteOpenAIServer, + client: openai.OpenAI, + disagg_host: str, + model_name: str, +): + disagg_health_url = f"http://{disagg_host}:{DISAGG_SERVER_PORT}/health/" + wait_for_endpoint_ready(disagg_health_url) + if is_pytest_node(): + print(f"running test_completion on rank {RANK} node rank {NODE_RANK}") + prompt = "What is the result of 1+1? Answer in one word: " + for _ in range(10): + completion = client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=10, + temperature=0.0, + ) + print(f"Output: {completion.choices[0].text}") + assert completion.id is not None + message = completion.choices[0].text + assert message.startswith("2.") + + perf_metrics = get_timing_metrics(disagg_server.url_root) + validate_timing_metrics(perf_metrics, "multinode test_completion") + + disagg_server.terminate() + + elif is_gen_node(): + # keep gen workers alive until the test ends + wait_for_endpoint_down(disagg_health_url) + assert True + else: + assert True diff --git a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py index 15a1f66cd5..1ca1beec2a 100644 --- a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py +++ b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py @@ -1,6 +1,5 @@ import os import tempfile -from typing import List import openai import pytest @@ -8,42 +7,11 @@ import requests import yaml from ..test_llm import 
get_model_path -from .openai_server import RemoteOpenAIServer +from .openai_server import RemoteMMEncoderServer pytestmark = pytest.mark.threadleak(enabled=False) -class RemoteMMEncoderServer(RemoteOpenAIServer): - """Remote server for testing multimodal encoder endpoints.""" - - def __init__(self, - model: str, - cli_args: List[str] = None, - port: int = None) -> None: - # Reuse parent initialization but change the command - import subprocess - import sys - - from tensorrt_llm.llmapi.mpi_session import find_free_port - - self.host = "localhost" - self.port = port if port is not None else find_free_port() - self.rank = os.environ.get("SLURM_PROCID", 0) - - args = ["--host", f"{self.host}", "--port", f"{self.port}"] - if cli_args: - args += cli_args - - # Use mm_embedding_serve command instead of regular serve - launch_cmd = ["trtllm-serve", "mm_embedding_serve"] + [model] + args - - self.proc = subprocess.Popen(launch_cmd, - stdout=sys.stdout, - stderr=sys.stderr) - self._wait_for_server(url=self.url_for("health"), - timeout=self.MAX_SERVER_START_WAIT_S) - - @pytest.fixture(scope="module", ids=["Qwen2.5-VL-3B-Instruct"]) def model_name(): return "Qwen2.5-VL-3B-Instruct" diff --git a/tests/unittest/llmapi/apps/openai_server.py b/tests/unittest/llmapi/apps/openai_server.py index 15dd94eb47..ebbe0d5627 100644 --- a/tests/unittest/llmapi/apps/openai_server.py +++ b/tests/unittest/llmapi/apps/openai_server.py @@ -11,7 +11,8 @@ import openai import requests import yaml -from tensorrt_llm.llmapi.mpi_session import find_free_port +from tensorrt_llm._utils import get_free_port +from tensorrt_llm.llmapi.disagg_utils import ServerRole class RemoteOpenAIServer: @@ -26,13 +27,21 @@ class RemoteOpenAIServer: host: str = "localhost", env: Optional[dict] = None, rank: int = -1, - extra_config: Optional[dict] = None) -> None: + extra_config: Optional[dict] = None, + log_path: Optional[str] = None, + wait: bool = True, + role: Optional[ServerRole] = None) -> None: self.host = host - 
self.port = port if port is not None else find_free_port() + self.port = port if port is not None else get_free_port() self.rank = rank if rank != -1 else int( os.environ.get("SLURM_PROCID", 0)) self.extra_config_file = None + self.log_path = log_path + self.log_file = None + self.role = role args = ["--host", f"{self.host}", "--port", f"{self.port}"] + if self.role is not None: + args += ["--server_role", self.role.name] if cli_args: args += cli_args if extra_config: @@ -50,10 +59,19 @@ class RemoteOpenAIServer: env = os.environ.copy() self.proc = subprocess.Popen(launch_cmd, env=env, - stdout=sys.stdout, - stderr=sys.stderr) - self._wait_for_server(url=self.url_for("health"), - timeout=self.MAX_SERVER_START_WAIT_S) + stdout=self._get_output(), + stderr=self._get_output()) + if wait: + self.wait_for_server(timeout=self.MAX_SERVER_START_WAIT_S) + + def _get_output(self): + if self.log_file: + return self.log_file + elif self.log_path: + self.log_file = open(self.log_path, "w+") + return self.log_file + else: + return sys.stdout def __enter__(self): return self @@ -76,6 +94,12 @@ class RemoteOpenAIServer: except Exception as e: print(f"Error removing extra config file: {e}") self.proc = None + if self.log_file: + self.log_file.close() + self.log_file = None + + def wait_for_server(self, timeout: float): + self._wait_for_server(url=self.url_for("health"), timeout=timeout) def _wait_for_server(self, *, url: str, timeout: float): # run health check on the first rank only. 
@@ -128,21 +152,28 @@ class RemoteDisaggOpenAIServer(RemoteOpenAIServer): gen_servers: List[str], port: int = -1, env: Optional[dict] = None, - llmapi_launch: bool = False) -> None: + llmapi_launch: bool = False, + disagg_config: Optional[dict] = None, + log_path: Optional[str] = None, + wait_ready: bool = True) -> None: self.ctx_servers = ctx_servers self.gen_servers = gen_servers - self.host = "localhost" - self.port = find_free_port() if port is None or port < 0 else port + self.host = "0.0.0.0" + self.port = get_free_port() if port is None or port < 0 else port self.rank = 0 - with tempfile.NamedTemporaryFile(mode="w+", - delete=False, - delete_on_close=False) as f: - f.write(self._get_extra_config()) - f.flush() - self.extra_config_file = f.name + self.disagg_config = self._get_extra_config() + if disagg_config: + self.disagg_config.update(disagg_config) + self.log_path = log_path + self.log_file = None + self.extra_config_file = os.path.join( + tempfile.gettempdir(), f"disagg_config_{self.port}.yaml") + with open(self.extra_config_file, "w+") as f: + yaml.dump(self.disagg_config, f) launch_cmd = [ "trtllm-serve", "disaggregated", "-c", self.extra_config_file ] + print(f"launch_cmd: {launch_cmd}, extra_config: {self.disagg_config}") if llmapi_launch: # start server with llmapi-launch on multi nodes launch_cmd = ["trtllm-llmapi-launch"] + launch_cmd @@ -150,13 +181,14 @@ class RemoteDisaggOpenAIServer(RemoteOpenAIServer): env = os.environ.copy() self.proc = subprocess.Popen(launch_cmd, env=env, - stdout=sys.stdout, - stderr=sys.stderr) - self._wait_for_server(url=self.url_for("health"), - timeout=self.MAX_SERVER_START_WAIT_S) + stdout=self._get_output(), + stderr=self._get_output()) + if wait_ready: + self._wait_for_server(url=self.url_for("health"), + timeout=self.MAX_SERVER_START_WAIT_S) def _get_extra_config(self): - return yaml.dump({ + return { "context_servers": { "num_instances": len(self.ctx_servers), "urls": self.ctx_servers @@ -167,4 +199,38 @@ class 
RemoteDisaggOpenAIServer(RemoteOpenAIServer):
             },
             "port": self.port,
             "hostname": self.host,
-        })
+            "perf_metrics_max_requests": 1000,
+        }
+
+
+class RemoteMMEncoderServer(RemoteOpenAIServer):
+    """Remote server for testing multimodal encoder endpoints."""
+
+    def __init__(self,
+                 model: str,
+                 cli_args: List[str] = None,
+                 port: int = None,
+                 log_path: Optional[str] = None) -> None:
+        # Parent __init__ is bypassed; rank is an int, as in the parent class
+        import subprocess
+
+        from tensorrt_llm._utils import get_free_port
+
+        self.host = "localhost"
+        self.port = port if port is not None else get_free_port()
+        self.rank = int(os.environ.get("SLURM_PROCID", 0))
+        self.log_path = log_path
+        self.log_file = None
+
+        args = ["--host", f"{self.host}", "--port", f"{self.port}"]
+        if cli_args:
+            args += cli_args
+
+        # Use mm_embedding_serve command instead of regular serve
+        launch_cmd = ["trtllm-serve", "mm_embedding_serve"] + [model] + args
+
+        self.proc = subprocess.Popen(launch_cmd,
+                                     stdout=self._get_output(),
+                                     stderr=self._get_output())
+        self._wait_for_server(url=self.url_for("health"),
+                              timeout=self.MAX_SERVER_START_WAIT_S)
diff --git a/tests/unittest/llmapi/apps/test_disagg_serving_perf_metrics.py b/tests/unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
new file mode 100644
index 0000000000..d8af28a491
--- /dev/null
+++ b/tests/unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
@@ -0,0 +1,244 @@
+import os
+from typing import Tuple
+
+import openai
+import pytest
+from test_common.http_utils import wait_for_endpoint_ready
+from test_common.perf_metrics_utils import (
+    get_prometheus_metrics,
+    get_timing_metrics,
+    validate_timing_metrics,
+)
+
+from tensorrt_llm._utils import get_free_ports
+
+from ..test_llm import get_model_path
+from .openai_server import RemoteDisaggOpenAIServer, RemoteOpenAIServer
+
+
+@pytest.fixture
+def test_ports():
+    """Reserve three free ports (disagg server, ctx worker, gen worker)."""
+    return get_free_ports(3)
+
+
+@pytest.fixture
+def disagg_port(test_ports: list[int]):
+    """Port of the disaggregated server (first reserved port)."""
+    return test_ports[0]
+
+
+@pytest.fixture
+def ctx_port(test_ports: list[int]):
+    """Port of the context worker (second reserved port)."""
+    return test_ports[1]
+
+
+@pytest.fixture
+def gen_port(test_ports: list[int]):
+    """Port of the generation worker (third reserved port)."""
+    return test_ports[2]
+
+
+@pytest.fixture
+def model_name():
+    """Name of the model served by both workers."""
+    return "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+
+@pytest.fixture
+def disagg_cluster_config(disagg_port: int):
+    """Cluster settings whose URI points at the local disagg server port."""
+    return {
+        "cluster_uri": f"http://localhost:{disagg_port}",
+        "cluster_name": "",
+    }
+
+
+def worker_config(model_name: str, disagg_cluster_config: dict):
+    """Build the extra config shared by the context and generation workers."""
+    return {
+        "model": model_name,
+        "disagg_cluster": disagg_cluster_config,
+        "cache_transceiver_config": {
+            "backend": "DEFAULT",
+        },
+        "kv_cache_config": {
+            "free_gpu_memory_fraction": 0.2,
+            "enable_block_reuse": False,
+        },
+        "disable_overlap_scheduler": True,
+        "cuda_graph_config": None,
+        "return_perf_metrics": True,
+        "perf_metrics_max_requests": 1000,
+    }
+
+
+@pytest.fixture
+def workers(model_name: str, disagg_cluster_config: dict, ctx_port: int, gen_port: int):
+    """Launch one context and one generation worker; yield them as a pair.
+
+    Both are started with ``wait=False``, so the test has to call
+    ``wait_for_server`` on them before sending requests.
+    """
+    model_path = get_model_path(model_name)
+    extra_config = worker_config(model_name, disagg_cluster_config)
+
+    def worker(server_role: str, port: int):
+        return RemoteOpenAIServer(
+            model_path,
+            port=port,
+            env=os.environ.copy(),
+            cli_args=["--server_role", server_role],
+            llmapi_launch=False,
+            extra_config=extra_config,
+            log_path=f"output_{server_role}.log",
+            wait=False,
+        )
+
+    with worker("context", ctx_port) as ctx_worker, worker("generation", gen_port) as gen_worker:
+        yield ctx_worker, gen_worker
+
+
+@pytest.fixture
+def disagg_server(disagg_cluster_config: dict, workers, disagg_port: int):
+    """Start the disagg server (after the workers) with empty ctx/gen URL lists."""
+    disagg_config = {
+        "port": disagg_port,
+        "disagg_cluster": disagg_cluster_config,
+        "perf_metrics_max_requests": 1000,
+    }
+    with RemoteDisaggOpenAIServer(
+        ctx_servers=[],
+        gen_servers=[],
+        port=disagg_config["port"],
+        llmapi_launch=False,
+        disagg_config=disagg_config,
+    ) as server:
+        yield server
+
+
+@pytest.fixture
+def client(disagg_server: RemoteDisaggOpenAIServer):
+    """Synchronous client obtained from the disagg server."""
+    return disagg_server.get_client()
+
+
+@pytest.fixture
+def async_client(disagg_server: RemoteDisaggOpenAIServer):
+    """Asynchronous client obtained from the disagg server."""
+    return disagg_server.get_async_client()
+
+
+async def send_request(
+    client: openai.AsyncOpenAI, stream: bool, repeat: int, max_token: int, model_name: str
+):
+    """Send ``repeat`` identical completion requests one after another.
+
+    Each answer (joined from the chunks when ``stream`` is true) must start
+    with "2.".
+    """
+    for _ in range(repeat):
+        prompt = "What is the result of 1+1? Answer in one word: "
+        completion = await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=max_token,
+            temperature=0.0,
+            stream=stream,
+        )
+        if stream:
+            output = []
+            async for chunk in completion:
+                output.append(chunk.choices[0].text)
+            assert len(output) > 0
+            message = "".join(output)
+        else:
+            assert completion.id is not None
+            message = completion.choices[0].text
+        assert message.startswith("2.")
+
+
+def check_historgram(metrics_dict: dict, count: int, range: tuple[float, float]):
+    """Assert a histogram has ``count`` samples and its mean lies inside ``range``.
+
+    ``range`` is an exclusive ``(low, high)`` pair.
+    """
+    assert metrics_dict["count"] == count
+    mean = metrics_dict["sum"] / metrics_dict["count"]
+    assert mean > range[0] and mean < range[1]
+
+
+@pytest.mark.asyncio
+@pytest.mark.timeout(300)
+async def test_completion_metrics(
+    async_client: openai.AsyncOpenAI,
+    workers: Tuple[RemoteOpenAIServer, RemoteOpenAIServer],
+    disagg_server: RemoteDisaggOpenAIServer,
+    model_name: str,
+):
+    """Check disagg metrics after streaming, then non-streaming, completions."""
+    assert len(workers) == 2
+    for worker in workers:
+        worker.wait_for_server(timeout=120)
+    wait_for_endpoint_ready(disagg_server.url_root + "/health")
+
+    max_token = 10
+    total_requests = 10
+    await send_request(
+        client=async_client,
+        stream=True,
+        repeat=total_requests,
+        max_token=max_token,
+        model_name=model_name,
+    )
+    timing_metrics = get_timing_metrics(disagg_server.url_root)
+    validate_timing_metrics(timing_metrics, "test_completion_metrics")
+
+    metrics = get_prometheus_metrics(disagg_server.url_root)
+    print(metrics)
+
+    for role in ["ctx", "gen"]:
+        assert metrics[f"{role}_total_requests"] == total_requests
+        assert metrics[f"{role}_completed_requests"] == total_requests
+        assert metrics[f"{role}_error_requests"] == 0
+        assert f"{role}_retry_requests" in metrics
+
+    check_historgram(metrics["gen_first_token_latency_seconds"], total_requests, (0.0, 0.3))
+    check_historgram(metrics["gen_complete_latency_seconds"], total_requests, (0.0, 0.6))
+
+    assert metrics["total_requests"] == total_requests
+    assert metrics["stream_requests"] == total_requests
+    assert metrics["nonstream_requests"] == 0
+    assert metrics["total_responses"] == total_requests
+    assert metrics["validation_exceptions"] == 0
+    assert metrics["http_exceptions"] == 0
+    assert metrics["internal_errors"] == 0
+    check_historgram(metrics["queue_latency_seconds"], total_requests, (0.0, 0.03))
+
+    # test non streaming part
+    await send_request(
+        client=async_client,
+        stream=False,
+        repeat=total_requests,
+        max_token=max_token,
+        model_name=model_name,
+    )
+
+    metrics = get_prometheus_metrics(disagg_server.url_root)
+    for role in ["ctx", "gen"]:
+        assert metrics[f"{role}_total_requests"] == total_requests * 2
+        assert metrics[f"{role}_completed_requests"] == total_requests * 2
+        assert metrics[f"{role}_error_requests"] == 0
+        assert f"{role}_retry_requests" in metrics
+
+    assert metrics["total_requests"] == total_requests * 2
+    assert metrics["stream_requests"] == total_requests
+    assert metrics["nonstream_requests"] == total_requests
+    assert metrics["total_responses"] == total_requests * 2
+    assert metrics["validation_exceptions"] == 0
+    assert metrics["http_exceptions"] == 0
+    assert metrics["internal_errors"] == 0
+
+    check_historgram(metrics["gen_complete_latency_seconds"], total_requests * 2, (0.0, 0.6))
+    check_historgram(metrics["queue_latency_seconds"], total_requests * 2, (0.0, 0.03))
diff --git a/tests/unittest/llmapi/apps/utils.py b/tests/unittest/llmapi/apps/utils.py
index 2990c1b2db..783f6937bd 100644
--- a/tests/unittest/llmapi/apps/utils.py
+++ b/tests/unittest/llmapi/apps/utils.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 
import re +import time from pathlib import Path from typing import Any, Callable import pytest +import requests import yaml from ..test_llm import get_model_path @@ -257,3 +259,29 @@ def expand_slurm_nodelist(nodelist_str): expanded_nodes.append(group) return expanded_nodes + + +def wait_for_endpoint_ready(url: str, timeout: int = 300, interval: int = 3): + start = time.monotonic() + while time.monotonic() - start < timeout: + try: + time.sleep(interval) + if requests.get(url).status_code == 200: + print(f"endpoint {url} is ready") + return + except Exception as err: + print(f"endpoint {url} is not ready, with exception: {err}") + + +def wait_for_endpoint_down(url: str, timeout: int = 300): + start = time.monotonic() + while time.monotonic() - start < timeout: + try: + if requests.get(url).status_code >= 100: + print( + f"endpoint {url} returned status code {requests.get(url).status_code}" + ) + time.sleep(1) + except Exception as err: + print(f"endpoint {url} is down, with exception: {err}") + return diff --git a/tests/unittest/pytest.ini b/tests/unittest/pytest.ini index 8690cf25df..ccd67fbbf5 100644 --- a/tests/unittest/pytest.ini +++ b/tests/unittest/pytest.ini @@ -9,6 +9,7 @@ pythonpath = ../../examples/auto_deploy ../../examples/models/core ../../examples + ../ env = D:AUTO_DEPLOY_LOG_LEVEL=INFO markers =